From 89df92219b45eb596e6b6508295b4218dc0a3519 Mon Sep 17 00:00:00 2001 From: Viktor Szakats Date: Tue, 20 Oct 2009 20:41:05 +0000 Subject: [PATCH] 2009-10-20 22:40 UTC+0200 Viktor Szakats (harbour.01 syenar.hu) * external/pcre/pcretryf.c * external/pcre/pcre.h * external/pcre/pcreexec.c * external/pcre/pcreinal.h * external/pcre/pcredfa.c * external/pcre/pcrecomp.c * external/pcre/config.h * external/pcre/LICENCE * external/pcre/pcreucd.c * external/pcre/pcreprni.h * external/pcre/pcrestud.c * external/pcre/pcrefinf.c + Updated pcre to 8.0 (from 7.9) * external/pcre/cnv_o2hb.bat * external/pcre/cnv_hb2o.bat ! Minor fix. --- harbour/ChangeLog | 19 + harbour/external/pcre/LICENCE | 4 +- harbour/external/pcre/cnv_hb2o.bat | 2 +- harbour/external/pcre/cnv_o2hb.bat | 2 +- harbour/external/pcre/config.h | 12 +- harbour/external/pcre/pcre.h | 12 +- harbour/external/pcre/pcrecomp.c | 302 +++++-- harbour/external/pcre/pcredfa.c | 321 +++++-- harbour/external/pcre/pcreexec.c | 1316 ++++++++++++++++++++++------ harbour/external/pcre/pcrefinf.c | 11 +- harbour/external/pcre/pcreinal.h | 119 ++- harbour/external/pcre/pcreprni.h | 13 + harbour/external/pcre/pcrestud.c | 424 ++++++++- harbour/external/pcre/pcretryf.c | 6 +- harbour/external/pcre/pcreucd.c | 18 + 15 files changed, 2076 insertions(+), 505 deletions(-) diff --git a/harbour/ChangeLog b/harbour/ChangeLog index a7ab2c26ec..9ce4407641 100644 --- a/harbour/ChangeLog +++ b/harbour/ChangeLog @@ -17,6 +17,25 @@ past entries belonging to author(s): Viktor Szakats. 
*/ +2009-10-20 22:40 UTC+0200 Viktor Szakats (harbour.01 syenar.hu) + * external/pcre/pcretryf.c + * external/pcre/pcre.h + * external/pcre/pcreexec.c + * external/pcre/pcreinal.h + * external/pcre/pcredfa.c + * external/pcre/pcrecomp.c + * external/pcre/config.h + * external/pcre/LICENCE + * external/pcre/pcreucd.c + * external/pcre/pcreprni.h + * external/pcre/pcrestud.c + * external/pcre/pcrefinf.c + + Updated pcre to 8.0 (from 7.9) + + * external/pcre/cnv_o2hb.bat + * external/pcre/cnv_hb2o.bat + ! Minor fix. + 2009-10-20 21:35 UTC+0200 Viktor Szakats (harbour.01 syenar.hu) * contrib/rddads/adsfunc.c * contrib/rddads/rddads.h diff --git a/harbour/external/pcre/LICENCE b/harbour/external/pcre/LICENCE index 03fabc6aef..73f8cde3d8 100644 --- a/harbour/external/pcre/LICENCE +++ b/harbour/external/pcre/LICENCE @@ -4,7 +4,7 @@ PCRE LICENCE PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. -Release 7 of PCRE is distributed under the terms of the "BSD" licence, as +Release 8 of PCRE is distributed under the terms of the "BSD" licence, as specified below. The documentation for PCRE, supplied in the "doc" directory, is distributed under the same terms as the software itself. @@ -22,7 +22,7 @@ Email domain: cam.ac.uk University of Cambridge Computing Service, Cambridge, England. -Copyright (c) 1997-2008 University of Cambridge +Copyright (c) 1997-2009 University of Cambridge All rights reserved. diff --git a/harbour/external/pcre/cnv_hb2o.bat b/harbour/external/pcre/cnv_hb2o.bat index 7be1f7a925..4d596b292f 100644 --- a/harbour/external/pcre/cnv_hb2o.bat +++ b/harbour/external/pcre/cnv_hb2o.bat @@ -21,7 +21,7 @@ rem don't have to mess with this tool. 
md ori_dst del ori_dst\*.* /Y -copy LICENSE ori_dst\LICENSE +copy LICENCE ori_dst\LICENCE copy config.h ori_dst\config.h.generic copy pcre.h ori_dst\pcre.h.generic copy pcreinal.h ori_dst\pcre_internal.h diff --git a/harbour/external/pcre/cnv_o2hb.bat b/harbour/external/pcre/cnv_o2hb.bat index 0ecc4a5007..7e8a2c50b3 100644 --- a/harbour/external/pcre/cnv_o2hb.bat +++ b/harbour/external/pcre/cnv_o2hb.bat @@ -21,7 +21,7 @@ rem DISCLAIMER: This tool is targeted only to Harbour core rem maintainers. If you're not one of them you rem don't have to mess with this tool. -copy ori_src\LICENSE LICENSE +copy ori_src\LICENCE LICENCE copy ori_src\config.h.generic config.h copy ori_src\pcre.h.generic pcre.h copy ori_src\pcre_internal.h pcreinal.h diff --git a/harbour/external/pcre/config.h b/harbour/external/pcre/config.h index 7bfcdb858c..6a3d949180 100644 --- a/harbour/external/pcre/config.h +++ b/harbour/external/pcre/config.h @@ -175,6 +175,12 @@ them both to 0; an emulation function will be used. */ #define LINK_SIZE 2 #endif +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#ifndef LT_OBJDIR +#define LT_OBJDIR ".libs/" +#endif + /* The value of MATCH_LIMIT determines the default number of times the internal match() function can be called during a single execution of pcre_exec(). There is a runtime interface for setting a different limit. @@ -241,13 +247,13 @@ them both to 0; an emulation function will be used. */ #define PACKAGE_NAME "PCRE" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE 7.9" +#define PACKAGE_STRING "PCRE 8.00" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre" /* Define to the version of this package. */ -#define PACKAGE_VERSION "7.9" +#define PACKAGE_VERSION "8.00" /* If you are compiling for a system other than a Unix-like system or @@ -303,7 +309,7 @@ them both to 0; an emulation function will be used. 
*/ /* Version number of package */ #ifndef VERSION -#define VERSION "7.9" +#define VERSION "8.00" #endif /* Define to empty if `const' does not conform to ANSI C. */ diff --git a/harbour/external/pcre/pcre.h b/harbour/external/pcre/pcre.h index c5fc4c13e4..93dff102ac 100644 --- a/harbour/external/pcre/pcre.h +++ b/harbour/external/pcre/pcre.h @@ -41,10 +41,10 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ -#define PCRE_MAJOR 7 -#define PCRE_MINOR 9 +#define PCRE_MAJOR 8 +#define PCRE_MINOR 00 #define PCRE_PRERELEASE -#define PCRE_DATE 2009-04-11 +#define PCRE_DATE 2009-10-19 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE, the appropriate @@ -113,7 +113,8 @@ both, so we keep them all distinct. */ #define PCRE_NO_AUTO_CAPTURE 0x00001000 #define PCRE_NO_UTF8_CHECK 0x00002000 #define PCRE_AUTO_CALLOUT 0x00004000 -#define PCRE_PARTIAL 0x00008000 +#define PCRE_PARTIAL_SOFT 0x00008000 +#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */ #define PCRE_DFA_SHORTEST 0x00010000 #define PCRE_DFA_RESTART 0x00020000 #define PCRE_FIRSTLINE 0x00040000 @@ -128,6 +129,8 @@ both, so we keep them all distinct. */ #define PCRE_JAVASCRIPT_COMPAT 0x02000000 #define PCRE_NO_START_OPTIMIZE 0x04000000 #define PCRE_NO_START_OPTIMISE 0x04000000 +#define PCRE_PARTIAL_HARD 0x08000000 +#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec-time and get/set-time error codes */ @@ -174,6 +177,7 @@ both, so we keep them all distinct. */ #define PCRE_INFO_OKPARTIAL 12 #define PCRE_INFO_JCHANGED 13 #define PCRE_INFO_HASCRORLF 14 +#define PCRE_INFO_MINLENGTH 15 /* Request types for pcre_config(). Do not re-arrange, in order to remain compatible. 
*/ diff --git a/harbour/external/pcre/pcrecomp.c b/harbour/external/pcre/pcrecomp.c index 81948e645b..4c91b81c17 100644 --- a/harbour/external/pcre/pcrecomp.c +++ b/harbour/external/pcre/pcrecomp.c @@ -341,7 +341,9 @@ static const char error_texts[] = "number is too big\0" "subpattern name expected\0" "digit expected after (?+\0" - "] is an invalid data character in JavaScript compatibility mode"; + "] is an invalid data character in JavaScript compatibility mode\0" + /* 65 */ + "different names for subpatterns of the same number are not allowed"; /* Table to identify digits and hex digits. This is used when compiling @@ -1100,6 +1102,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS) if (name != NULL && lorn == ptr - thisname && strncmp((const char *)name, (const char *)thisname, lorn) == 0) return *count; + term++; } } } @@ -1134,19 +1137,21 @@ for (; *ptr != 0; ptr++) BOOL negate_class = FALSE; for (;;) { - int c = *(++ptr); - if (c == CHAR_BACKSLASH) + if (ptr[1] == CHAR_BACKSLASH) { - if (ptr[1] == CHAR_E) - ptr++; - else if (strncmp((const char *)ptr+1, + if (ptr[2] == CHAR_E) + ptr+= 2; + else if (strncmp((const char *)ptr+2, STR_Q STR_BACKSLASH STR_E, 3) == 0) - ptr += 3; + ptr += 4; else break; } - else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) + else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT) + { negate_class = TRUE; + ptr++; + } else break; } @@ -1312,7 +1317,9 @@ for (;;) case OP_CALLOUT: case OP_CREF: + case OP_NCREF: case OP_RREF: + case OP_NRREF: case OP_DEF: code += _pcre_OP_lengths[*code]; break; @@ -1328,23 +1335,34 @@ for (;;) /************************************************* -* Find the fixed length of a pattern * +* Find the fixed length of a branch * *************************************************/ -/* Scan a pattern and compute the fixed length of subject that will match it, +/* Scan a branch and compute the fixed length of subject that will match it, if the length is fixed. This is needed for dealing with backward assertions. 
-In UTF8 mode, the result is in characters rather than bytes. +In UTF8 mode, the result is in characters rather than bytes. The branch is +temporarily terminated with OP_END when this function is called. + +This function is called when a backward assertion is encountered, so that if it +fails, the error message can point to the correct place in the pattern. +However, we cannot do this when the assertion contains subroutine calls, +because they can be forward references. We solve this by remembering this case +and doing the check at the end; a flag specifies which mode we are running in. Arguments: code points to the start of the pattern (the bracket) options the compiling options + atend TRUE if called when the pattern is complete + cd the "compile data" structure -Returns: the fixed length, or -1 if there is no fixed length, +Returns: the fixed length, + or -1 if there is no fixed length, or -2 if \C was encountered + or -3 if an OP_RECURSE item was encountered and atend is FALSE */ static int -find_fixedlength(uschar *code, int options) +find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd) { int length = -1; @@ -1357,6 +1375,7 @@ branch, check the length against that of the other branches. */ for (;;) { int d; + uschar *ce, *cs; register int op = *cc; switch (op) { @@ -1364,7 +1383,7 @@ for (;;) case OP_BRA: case OP_ONCE: case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); + d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -1387,6 +1406,21 @@ for (;;) branchlength = 0; break; + /* A true recursion implies not fixed length, but a subroutine call may + be OK. If the subroutine is a forward reference, we can't deal with + it until the end of the pattern, so return -3. 
*/ + + case OP_RECURSE: + if (!atend) return -3; + cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */ + do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ + if (cc > cs && cc < ce) return -1; /* Recursion */ + d = find_fixedlength(cs + 2, options, atend, cd); + if (d < 0) return d; + branchlength += d; + cc += 1 + LINK_SIZE; + break; + /* Skip over assertive subpatterns */ case OP_ASSERT: @@ -1400,7 +1434,9 @@ for (;;) case OP_REVERSE: case OP_CREF: + case OP_NCREF: case OP_RREF: + case OP_NRREF: case OP_DEF: case OP_OPT: case OP_CALLOUT: @@ -1423,10 +1459,8 @@ for (;;) branchlength++; cc += 2; #ifdef SUPPORT_UTF8 - if ((options & PCRE_UTF8) != 0) - { - while ((*cc & 0xc0) == 0x80) cc++; - } + if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0) + cc += _pcre_utf8_table4[cc[-1] & 0x3f]; #endif break; @@ -1437,10 +1471,8 @@ for (;;) branchlength += GET2(cc,1); cc += 4; #ifdef SUPPORT_UTF8 - if ((options & PCRE_UTF8) != 0) - { - while((*cc & 0x80) == 0x80) cc++; - } + if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0) + cc += _pcre_utf8_table4[cc[-1] & 0x3f]; #endif break; @@ -1519,22 +1551,25 @@ for (;;) /************************************************* -* Scan compiled regex for numbered bracket * +* Scan compiled regex for specific bracket * *************************************************/ /* This little function scans through a compiled pattern until it finds a -capturing bracket with the given number. +capturing bracket with the given number, or, if the number is negative, an +instance of OP_REVERSE for a lookbehind. The function is global in the C sense +so that it can be called from pcre_study() when finding the minimum matching +length. 
Arguments: code points to start of expression utf8 TRUE in UTF-8 mode - number the required bracket number + number the required bracket number or negative to find a lookbehind Returns: pointer to the opcode for the bracket, or NULL if not found */ -static const uschar * -find_bracket(const uschar *code, BOOL utf8, int number) +const uschar * +_pcre_find_bracket(const uschar *code, BOOL utf8, int number) { for (;;) { @@ -1547,6 +1582,14 @@ for (;;) if (c == OP_XCLASS) code += GET(code, 1); + /* Handle lookbehind (OP_REVERSE) */ + + else if (c == OP_REVERSE) + { + if (number < 0) return (uschar *)code; + code += _pcre_OP_lengths[c]; + } + /* Handle capturing bracket */ else if (c == OP_CBRA) @@ -1912,10 +1955,13 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: + if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f]; + break; + case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - if (utf8) while ((code[2] & 0xc0) == 0x80) code++; + if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f]; break; #endif } @@ -3869,10 +3915,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (repeat_max == 0) goto END_REPEAT; + /*--------------------------------------------------------------------*/ + /* This code is obsolete from release 8.00; the restriction was finally + removed: */ + /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ - if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; + /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */ + /*--------------------------------------------------------------------*/ /* Combine the op_type with the repeat_type */ @@ -4019,10 +4070,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ goto END_REPEAT; } + /*--------------------------------------------------------------------*/ + /* This code is obsolete from release 8.00; the restriction was finally + removed: */ + /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ - if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; + /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */ + /*--------------------------------------------------------------------*/ if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; @@ -4337,11 +4393,20 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (possessive_quantifier) { int len; - if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || - *tempcode == OP_NOTEXACT) + + if (*tempcode == OP_TYPEEXACT) tempcode += _pcre_OP_lengths[*tempcode] + - ((*tempcode == OP_TYPEEXACT && - (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0); + ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0); + + else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT) + { + tempcode += _pcre_OP_lengths[*tempcode]; +#ifdef SUPPORT_UTF8 + if (utf8 && tempcode[-1] >= 0xc0) + tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f]; +#endif + } + len = code - tempcode; if (len > 0) switch (*tempcode) { @@ -4419,8 +4484,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (namelen == verbs[i].len && strncmp((char *)name, vn, namelen) == 0) { - *code = verbs[i].op; - if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; + /* Check for open captures before ACCEPT */ + + if (verbs[i].op == OP_ACCEPT) + { + open_capitem *oc; + cd->had_accept = TRUE; + for (oc = cd->open_caps; oc != NULL; oc = oc->next) + { + *code++ = OP_CLOSE; + PUT2INC(code, 0, oc->number); + } + } + *code++ = verbs[i].op; break; } vn += verbs[i].len + 1; @@ -4582,7 +4658,10 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ } /* Otherwise (did not start with "+" or "-"), start by looking for the - name. */ + name. If we find a name, add one to the opcode to change OP_CREF or + OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same, + except they record that the reference was originally to a name. The + information is used to check duplicate names. */ slot = cd->name_table; for (i = 0; i < cd->names_found; i++) @@ -4597,6 +4676,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { recno = GET2(slot, 0); PUT2(code, 2+LINK_SIZE, recno); + code[1+LINK_SIZE]++; } /* Search the pattern for a forward reference */ @@ -4605,6 +4685,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); + code[1+LINK_SIZE]++; } /* If terminator == 0 it means that the name followed directly after @@ -4797,11 +4878,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } } - /* In the real compile, create the entry in the table */ + /* In the real compile, create the entry in the table, maintaining + alphabetical order. Duplicate names for different numbers are + permitted only if PCRE_DUPNAMES is set. Duplicate names for the same + number are always OK. (An existing number can be re-used if (?| + appears in the pattern.) In either event, a duplicate name results in + a duplicate entry in the table, even if the number is the same. This + is because the number of names, and hence the table size, is computed + in the pre-compile, and it affects various numbers and pointers which + would all have to be modified, and the compiled code moved down, if + duplicates with the same number were omitted from the table. This + doesn't seem worth the hassle. However, *different* names for the + same number are not permitted. 
*/ else { + BOOL dupname = FALSE; slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) { int crc = memcmp(name, slot+2, namelen); @@ -4809,33 +4903,66 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { if (slot[2+namelen] == 0) { - if ((options & PCRE_DUPNAMES) == 0) + if (GET2(slot, 0) != cd->bracount + 1 && + (options & PCRE_DUPNAMES) == 0) { *errorcodeptr = ERR43; goto FAILED; } + else dupname = TRUE; } - else crc = -1; /* Current name is substring */ + else crc = -1; /* Current name is a substring */ } + + /* Make space in the table and break the loop for an earlier + name. For a duplicate or later name, carry on. We do this for + duplicates so that in the simple case (when (?| is not used) they + are in order of their numbers. */ + if (crc < 0) { memmove(slot + cd->name_entry_size, slot, (cd->names_found - i) * cd->name_entry_size); break; } + + /* Continue the loop for a later or duplicate name */ + slot += cd->name_entry_size; } + /* For non-duplicate names, check for a duplicate number before + adding the new name. */ + + if (!dupname) + { + uschar *cslot = cd->name_table; + for (i = 0; i < cd->names_found; i++) + { + if (cslot != slot) + { + if (GET2(cslot, 0) == cd->bracount + 1) + { + *errorcodeptr = ERR65; + goto FAILED; + } + } + else i--; + cslot += cd->name_entry_size; + } + } + PUT2(slot, 0, cd->bracount + 1); memcpy(slot + 2, name, namelen); slot[2+namelen] = 0; } } - /* In both cases, count the number of names we've encountered. */ + /* In both pre-compile and compile, count the number of names we've + encountered. */ - ptr++; /* Move past > or ' */ cd->names_found++; + ptr++; /* Move past > or ' */ goto NUMBERED_GROUP; @@ -5004,7 +5131,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ if (lengthptr == NULL) { *code = OP_END; - if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); + if (recno != 0) + called = _pcre_find_bracket(cd->start_code, utf8, recno); /* Forward reference */ @@ -5648,6 +5776,8 @@ uschar *code = *codeptr; uschar *last_branch = code; uschar *start_bracket = code; uschar *reverse_count = NULL; +open_capitem capitem; +int capnumber = 0; int firstbyte, reqbyte; int branchfirstbyte, branchreqbyte; int length; @@ -5674,6 +5804,17 @@ the code that abstracts option settings at the start of the pattern and makes them global. It tests the value of length for (2 + 2*LINK_SIZE) in the pre-compile phase to find out whether anything has yet been compiled or not. */ +/* If this is a capturing subpattern, add to the chain of open capturing items +so that we can detect them if (*ACCEPT) is encountered. */ + +if (*code == OP_CBRA) + { + capnumber = GET2(code, 1 + LINK_SIZE); + capitem.number = capnumber; + capitem.next = cd->open_caps; + cd->open_caps = &capitem; + } + /* Offset is set zero to mark that this bracket is still open */ PUT(code, 1, 0); @@ -5768,21 +5909,29 @@ for (;;) /* If lookbehind, check that this branch matches a fixed-length string, and put the length into the OP_REVERSE item. Temporarily mark the end of the - branch with OP_END. */ + branch with OP_END. If the branch contains OP_RECURSE, the result is -3 + because there may be forward references that we can't check here. Set a + flag to cause another lookbehind check at the end. Why not do it all at the + end? Because common, erroneous checks are picked up here and the offset of + the problem can be shown. 
*/ if (lookbehind) { int fixed_length; *code = OP_END; - fixed_length = find_fixedlength(last_branch, options); + fixed_length = find_fixedlength(last_branch, options, FALSE, cd); DPRINTF(("fixed length = %d\n", fixed_length)); - if (fixed_length < 0) + if (fixed_length == -3) + { + cd->check_lookbehind = TRUE; + } + else if (fixed_length < 0) { *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; *ptrptr = ptr; return FALSE; } - PUT(reverse_count, 0, fixed_length); + else { PUT(reverse_count, 0, fixed_length); } } } @@ -5810,6 +5959,10 @@ for (;;) while (branch_length > 0); } + /* If it was a capturing subpattern, remove it from the chain. */ + + if (capnumber > 0) cd->open_caps = cd->open_caps->next; + /* Fill in the ket */ *code = OP_KET; @@ -6012,7 +6165,9 @@ do { switch (*scode) { case OP_CREF: + case OP_NCREF: case OP_RREF: + case OP_NRREF: case OP_DEF: return FALSE; @@ -6181,9 +6336,7 @@ int length = 1; /* For final END opcode */ int firstbyte, reqbyte, newline; int errorcode = 0; int skipatstart = 0; -#ifdef SUPPORT_UTF8 -BOOL utf8; -#endif +BOOL utf8 = (options & PCRE_UTF8) != 0; size_t size; uschar *code; const uschar *codestart; @@ -6280,7 +6433,6 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && /* Can't support UTF8 unless PCRE has been compiled to include the code. */ #ifdef SUPPORT_UTF8 -utf8 = (options & PCRE_UTF8) != 0; if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) { @@ -6288,7 +6440,7 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && goto PCRE_EARLY_ERROR_RETURN2; } #else -if ((options & PCRE_UTF8) != 0) +if (utf8) { errorcode = ERR32; goto PCRE_EARLY_ERROR_RETURN; @@ -6377,6 +6529,7 @@ cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); cd->req_varyopt = 0; cd->external_options = options; cd->external_flags = 0; +cd->open_caps = NULL; /* Now do the pre-compile. 
On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. The initial options have @@ -6451,6 +6604,8 @@ cd->start_code = codestart; cd->hwm = cworkspace; cd->req_varyopt = 0; cd->had_accept = FALSE; +cd->check_lookbehind = FALSE; +cd->open_caps = NULL; /* Set up a starting, non-extracting bracket, then compile the expression. On error, errorcode will be set non-zero, so we don't need to look at the result @@ -6489,7 +6644,7 @@ while (errorcode == 0 && cd->hwm > cworkspace) cd->hwm -= LINK_SIZE; offset = GET(cd->hwm, 0); recno = GET(codestart, offset); - groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); + groupptr = _pcre_find_bracket(codestart, utf8, recno); if (groupptr == NULL) errorcode = ERR53; else PUT(((uschar *)codestart), offset, groupptr - codestart); } @@ -6499,6 +6654,47 @@ subpattern. */ if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; +/* If there were any lookbehind assertions that contained OP_RECURSE +(recursions or subroutine calls), a flag is set for them to be checked here, +because they may contain forward references. Actual recursions can't be fixed +length, but subroutine calls can. It is done like this so that those without +OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The +exceptional ones forgo this. We scan the pattern to check that they are fixed +length, and set their lengths. */ + +if (cd->check_lookbehind) + { + uschar *cc = (uschar *)codestart; + + /* Loop, searching for OP_REVERSE items, and process those that do not have + their length set. (Actually, it will also re-process any that have a length + of zero, but that is a pathological case, and it does no harm.) When we find + one, we temporarily terminate the branch it is in while we scan it. 
*/ + + for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1); + cc != NULL; + cc = (uschar *)_pcre_find_bracket(cc, utf8, -1)) + { + if (GET(cc, 1) == 0) + { + int fixed_length; + uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); + int end_op = *be; + *be = OP_END; + fixed_length = find_fixedlength(cc, re->options, TRUE, cd); + *be = end_op; + DPRINTF(("fixed length = %d\n", fixed_length)); + if (fixed_length < 0) + { + errorcode = (fixed_length == -2)? ERR36 : ERR25; + break; + } + PUT(cc, 1, fixed_length); + } + cc += 1 + LINK_SIZE; + } + } + /* Failed to compile, or error while post-processing */ if (errorcode != 0) diff --git a/harbour/external/pcre/pcredfa.c b/harbour/external/pcre/pcredfa.c index d923d74da6..30f43a9c02 100644 --- a/harbour/external/pcre/pcredfa.c +++ b/harbour/external/pcre/pcredfa.c @@ -45,6 +45,34 @@ FSM). This is NOT Perl- compatible, but it has advantages in certain applications. */ +/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved +the performance of his patterns greatly. I could not use it as it stood, as it +was not thread safe, and made assumptions about pattern sizes. Also, it caused +test 7 to loop, and test 9 to crash with a segfault. + +The issue is the check for duplicate states, which is done by a simple linear +search up the state list. (Grep for "duplicate" below to find the code.) For +many patterns, there will never be many states active at one time, so a simple +linear search is fine. In patterns that have many active states, it might be a +bottleneck. The suggested code used an indexing scheme to remember which states +had previously been used for each character, and avoided the linear search when +it knew there was no chance of a duplicate. This was implemented when adding +states to the state lists. + +I wrote some thread-safe, not-limited code to try something similar at the time +of checking for duplicates (instead of when adding states), using index vectors +on the stack. 
It did give a 13% improvement with one specially constructed +pattern for certain subject strings, but on other strings and on many of the +simpler patterns in the test suite it did worse. The major problem, I think, +was the extra time to initialize the index. This had to be done for each call +of internal_dfa_exec(). (The supplied patch used a static vector, initialized +only once - I suspect this was the cause of the problems with the tests.) + +Overall, I concluded that the gains in some cases did not outweigh the losses +in others, so I abandoned this code. */ + + + #ifdef HAVE_CONFIG_H #include "config.h" #endif @@ -81,8 +109,9 @@ never stored, so we push them well clear of the normal opcodes. */ character that is to be tested in some way. This makes is possible to centralize the loading of these characters. In the case of Type * etc, the "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a -small value. ***NOTE*** If the start of this table is modified, the two tables -that follow must also be modified. */ +small value. Non-zero values in the table are the offsets from the opcode where +the character is to be found. ***NOTE*** If the start of this table is +modified, the three tables that follow must also be modified. */ static const uschar coptable[] = { 0, /* End */ @@ -132,7 +161,63 @@ static const uschar coptable[] = { 0, /* DEF */ 0, 0, /* BRAZERO, BRAMINZERO */ 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ - 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */ + 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ +}; + +/* This table identifies those opcodes that inspect a character. It is used to +remember the fact that a character could have been inspected when the end of +the subject is reached. ***NOTE*** If the start of this table is modified, the +two tables that follow must also be modified. 
*/ + +static const uschar poptable[] = { + 0, /* End */ + 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ + 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ + 1, 1, 1, /* Any, AllAny, Anybyte */ + 1, 1, 1, /* NOTPROP, PROP, EXTUNI */ + 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ + 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ + 1, /* Char */ + 1, /* Charnc */ + 1, /* not */ + /* Positive single-char repeats */ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ + 1, 1, 1, /* upto, minupto, exact */ + 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ + /* Negative single-char repeats - only for chars < 256 */ + 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ + 1, 1, 1, /* NOT upto, minupto, exact */ + 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ + /* Positive type repeats */ + 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ + 1, 1, 1, /* Type upto, minupto, exact */ + 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ + /* Character class & ref repeats */ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ + 1, 1, /* CRRANGE, CRMINRANGE */ + 1, /* CLASS */ + 1, /* NCLASS */ + 1, /* XCLASS - variable length */ + 0, /* REF */ + 0, /* RECURSE */ + 0, /* CALLOUT */ + 0, /* Alt */ + 0, /* Ket */ + 0, /* KetRmax */ + 0, /* KetRmin */ + 0, /* Assert */ + 0, /* Assert not */ + 0, /* Assert behind */ + 0, /* Assert behind not */ + 0, /* Reverse */ + 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */ + 0, 0, 0, /* SBRA, SCBRA, SCOND */ + 0, /* CREF */ + 0, /* RREF */ + 0, /* DEF */ + 0, 0, /* BRAZERO, BRAMINZERO */ + 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ + 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ }; /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, @@ -390,6 +475,11 @@ if (*first_op == OP_REVERSE) current_subject -= gone_back; } + /* Save the earliest consulted character */ + + if (current_subject < md->start_used_ptr) + md->start_used_ptr = current_subject; + /* Now we can process the individual branches. 
*/ end_code = this_start_code; @@ -454,6 +544,8 @@ for (;;) int i, j; int clen, dlen; unsigned int c, d; + int forced_fail = 0; + BOOL could_continue = FALSE; /* Make the new state list into the active state list and empty the new state list. */ @@ -543,7 +635,9 @@ for (;;) } } - /* Check for a duplicate state with the same count, and skip if found. */ + /* Check for a duplicate state with the same count, and skip if found. + See the note at the head of this module about the possibility of improving + performance here. */ for (j = 0; j < i; j++) { @@ -560,6 +654,12 @@ for (;;) code = start_code + state_offset; codevalue = *code; + /* If this opcode inspects a character, but we are at the end of the + subject, remember the fact for use when testing for a partial match. */ + + if (clen == 0 && poptable[codevalue] != 0) + could_continue = TRUE; + /* If this opcode is followed by an inline character, load it. It is tempting to test for the presence of a subject character here, but that is wrong, because sometimes zero repetitions of the subject are @@ -610,7 +710,8 @@ for (;;) /* ========================================================================== */ /* Reached a closing bracket. If not at the end of the pattern, carry on with the next opcode. Otherwise, unless we have an empty string and - PCRE_NOTEMPTY is set, save the match data, shifting up all previous + PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the + start of the subject, save the match data, shifting up all previous matches so we always have the longest first. */ case OP_KET: @@ -624,26 +725,32 @@ for (;;) ADD_ACTIVE(state_offset - GET(code, 1), 0); } } - else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) + else { - if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; - else if (match_count > 0 && ++match_count * 2 >= offsetcount) - match_count = 0; - count = ((match_count == 0)? 
offsetcount : match_count * 2) - 2; - if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); - if (offsetcount >= 2) + if (ptr > current_subject || + ((md->moptions & PCRE_NOTEMPTY) == 0 && + ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || + current_subject > start_subject + md->start_offset))) { - offsets[0] = current_subject - start_subject; - offsets[1] = ptr - start_subject; - DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, - offsets[1] - offsets[0], current_subject)); - } - if ((md->moptions & PCRE_DFA_SHORTEST) != 0) - { - DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" - "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, - match_count, rlevel*2-2, SP)); - return match_count; + if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; + else if (match_count > 0 && ++match_count * 2 >= offsetcount) + match_count = 0; + count = ((match_count == 0)? offsetcount : match_count * 2) - 2; + if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); + if (offsetcount >= 2) + { + offsets[0] = current_subject - start_subject; + offsets[1] = ptr - start_subject; + DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, + offsets[1] - offsets[0], current_subject)); + } + if ((md->moptions & PCRE_DFA_SHORTEST) != 0) + { + DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" + "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, + match_count, rlevel*2-2, SP)); + return match_count; + } } } break; @@ -794,6 +901,7 @@ for (;;) if (ptr > start_subject) { const uschar *temp = ptr - 1; + if (temp < md->start_used_ptr) md->start_used_ptr = temp; #ifdef SUPPORT_UTF8 if (utf8) BACKCHAR(temp); #endif @@ -802,8 +910,9 @@ for (;;) } else left_word = 0; - if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; - else right_word = 0; + if (clen > 0) + right_word = c < 256 && (ctypes[c] & ctype_word) != 0; + else right_word = 0; if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) { 
ADD_ACTIVE(state_offset + 1, 0); } @@ -2157,11 +2266,12 @@ for (;;) /* ========================================================================== */ /* These are the opcodes for fancy brackets of various kinds. We have - to use recursion in order to handle them. The "always failing" assersion - (?!) is optimised when compiling to OP_FAIL, so we have to support that, + to use recursion in order to handle them. The "always failing" assertion + (?!) is optimised to OP_FAIL when compiling, so we have to support that, though the other "backtracking verbs" are not supported. */ case OP_FAIL: + forced_fail++; /* Count FAILs for multiple states */ break; case OP_ASSERT: @@ -2235,7 +2345,8 @@ for (;;) /* Back reference conditions are not supported */ - if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; + if (condcode == OP_CREF || condcode == OP_NCREF) + return PCRE_ERROR_DFA_UCOND; /* The DEFINE condition is always false */ @@ -2246,7 +2357,7 @@ for (;;) which means "test if in any recursion". We can't test for specifically recursed groups. */ - else if (condcode == OP_RREF) + else if (condcode == OP_RREF || condcode == OP_NRREF) { int value = GET2(code, LINK_SIZE+2); if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; @@ -2469,19 +2580,33 @@ for (;;) /* We have finished the processing at the current subject character. If no new states have been set for the next character, we have found all the matches that we are going to find. If we are at the top level and partial - matching has been requested, check for appropriate conditions. */ + matching has been requested, check for appropriate conditions. + + The "forced_fail" variable counts the number of (*F) encountered for the + character. If it is equal to the original active_count (saved in + workspace[1]) it means that (*F) was found on every active state. In this + case we don't want to give a partial match.
+ + The "could_continue" variable is true if a state could have continued but + for the fact that the end of the subject was reached. */ if (new_count <= 0) { - if (match_count < 0 && /* No matches found */ - rlevel == 1 && /* Top level match function */ - (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */ - ptr >= end_subject && /* Reached end of subject */ - ptr > current_subject) /* Matched non-empty string */ + if (rlevel == 1 && /* Top level, and */ + could_continue && /* Some could go on */ + forced_fail != workspace[1] && /* Not all forced fail & */ + ( /* either... */ + (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ + || /* or... */ + ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ + match_count < 0) /* no matches */ + ) && /* And... */ + ptr >= end_subject && /* Reached end of subject */ + ptr > current_subject) /* Matched non-empty string */ { if (offsetcount >= 2) { - offsets[0] = current_subject - start_subject; + offsets[0] = md->start_used_ptr - start_subject; offsets[1] = end_subject - start_subject; } match_count = PCRE_ERROR_PARTIAL; @@ -2623,6 +2748,7 @@ md->start_code = (const uschar *)argument_re + re->name_table_offset + re->name_count * re->name_entry_size; md->start_subject = (const unsigned char *)subject; md->end_subject = end_subject; +md->start_offset = start_offset; md->moptions = options; md->poptions = re->options; @@ -2727,8 +2853,8 @@ if (!anchored) } else { - if (startline && study != NULL && - (study->options & PCRE_STUDY_MAPPED) != 0) + if (!startline && study != NULL && + (study->flags & PCRE_STUDY_MAPPED) != 0) start_bits = study->start_bits; } } @@ -2779,13 +2905,11 @@ for (;;) } /* There are some optimizations that avoid running the match if a known - starting point is not found, or if a known later character is not present. - However, there is an option that disables these, for testing and for - ensuring that all callouts do actually occur. */ + starting point is not found. 
However, there is an option that disables + these, for testing and for ensuring that all callouts do actually occur. */ if ((options & PCRE_NO_START_OPTIMIZE) == 0) { - /* Advance to a known first byte. */ if (first_byte >= 0) @@ -2851,67 +2975,80 @@ for (;;) /* Restore fudged end_subject */ end_subject = save_end_subject; - } - /* If req_byte is set, we know that that character must appear in the subject - for the match to succeed. If the first character is set, req_byte must be - later in the subject; otherwise the test starts at the match point. This - optimization can save a huge amount of work in patterns with nested unlimited - repeats that aren't going to match. Writing separate code for cased/caseless - versions makes it go faster, as does using an autoincrement and backing off - on a match. + /* The following two optimizations are disabled for partial matching or if + disabling is explicitly requested (and of course, by the test above, this + code is not obeyed when restarting after a partial match). */ - HOWEVER: when the subject string is very, very long, searching to its end can - take a long time, and give bad performance on quite ordinary patterns. This - showed up when somebody was matching /^C/ on a 32-megabyte string... so we - don't do this when the string is sufficiently long. - - ALSO: this processing is disabled when partial matching is requested, and can - also be explicitly deactivated. */ - - if ((options & PCRE_NO_START_OPTIMIZE) == 0 && - req_byte >= 0 && - end_subject - current_subject < REQ_BYTE_MAX && - (options & PCRE_PARTIAL) == 0) - { - register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); - - /* We don't need to repeat the search if we haven't yet reached the - place we found it at last time. 
*/ - - if (p > req_byte_ptr) + if ((options & PCRE_NO_START_OPTIMIZE) == 0 && + (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0) { - if (req_byte_caseless) + /* If the pattern was studied, a minimum subject length may be set. This + is a lower bound; no actual string of that length may actually match the + pattern. Although the value is, strictly, in characters, we treat it as + bytes to avoid spending too much time in this optimization. */ + + if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && + end_subject - current_subject < study->minlength) + return PCRE_ERROR_NOMATCH; + + /* If req_byte is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_byte + must be later in the subject; otherwise the test starts at the match + point. This optimization can save a huge amount of work in patterns with + nested unlimited repeats that aren't going to match. Writing separate + code for cased/caseless versions makes it go faster, as does using an + autoincrement and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary + patterns. This showed up when somebody was matching /^C/ on a 32-megabyte + string... so we don't do this when the string is sufficiently long. */ + + if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX) { - while (p < end_subject) + register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); + + /* We don't need to repeat the search if we haven't yet reached the + place we found it at last time. 
*/ + + if (p > req_byte_ptr) { - register int pp = *p++; - if (pp == req_byte || pp == req_byte2) { p--; break; } + if (req_byte_caseless) + { + while (p < end_subject) + { + register int pp = *p++; + if (pp == req_byte || pp == req_byte2) { p--; break; } + } + } + else + { + while (p < end_subject) + { + if (*p++ == req_byte) { p--; break; } + } + } + + /* If we can't find the required character, break the matching loop, + which will cause a return or PCRE_ERROR_NOMATCH. */ + + if (p >= end_subject) break; + + /* If we have found the required character, save the point where we + found it, so that we don't search again next time round the loop if + the start hasn't passed this character yet. */ + + req_byte_ptr = p; } } - else - { - while (p < end_subject) - { - if (*p++ == req_byte) { p--; break; } - } - } - - /* If we can't find the required character, break the matching loop, - which will cause a return or PCRE_ERROR_NOMATCH. */ - - if (p >= end_subject) break; - - /* If we have found the required character, save the point where we - found it, so that we don't search again next time round the loop if - the start hasn't passed this character yet. */ - - req_byte_ptr = p; } - } + } /* End of optimizations that are done when not restarting */ /* OK, now we can do the business */ + md->start_used_ptr = current_subject; + rc = internal_dfa_exec( md, /* fixed match data */ md->start_code, /* this subexpression's code */ diff --git a/harbour/external/pcre/pcreexec.c b/harbour/external/pcre/pcreexec.c index 91cf4dfb8a..51bc53e70f 100644 --- a/harbour/external/pcre/pcreexec.c +++ b/harbour/external/pcre/pcreexec.c @@ -398,10 +398,32 @@ typedef struct heapframe { /* This function is called recursively in many circumstances. Whenever it returns a negative (error) response, the outer incarnation must also return the -same response. +same response. */ -Performance note: It might be tempting to extract commonly used fields from the -md structure (e.g. 
utf8, end_subject) into individual variables to improve +/* These macros pack up tests that are used for partial matching, and which +appears several times in the code. We set the "hit end" flag if the pointer is +at the end of the subject and also past the start of the subject (i.e. +something has been matched). For hard partial matching, we then return +immediately. The second one is used when we already know we are past the end of +the subject. */ + +#define CHECK_PARTIAL()\ + if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\ + {\ + md->hitend = TRUE;\ + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ + } + +#define SCHECK_PARTIAL()\ + if (md->partial != 0 && eptr > mstart)\ + {\ + md->hitend = TRUE;\ + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ + } + + +/* Performance note: It might be tempting to extract commonly used fields from +the md structure (e.g. utf8, end_subject) into individual variables to improve performance. Tests using gcc on a SPARC disproved this; in the first case, it made performance worse. @@ -642,14 +664,6 @@ for (;;) minimize = possessive = FALSE; op = *ecode; - /* For partial matching, remember if we ever hit the end of the subject after - matching at least one subject character. */ - - if (md->partial && - eptr >= md->end_subject && - eptr > mstart) - md->hitend = TRUE; - switch(op) { case OP_FAIL: @@ -825,18 +839,139 @@ for (;;) /* Now see what the actual condition is */ - if (condcode == OP_RREF) /* Recursion test */ + if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ { - offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ - condition = md->recursive != NULL && - (offset == RREF_ANY || offset == md->recursive->group_num); - ecode += condition? 
3 : GET(ecode, 1); + if (md->recursive == NULL) /* Not recursing => FALSE */ + { + condition = FALSE; + ecode += GET(ecode, 1); + } + else + { + int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + condition = (recno == RREF_ANY || recno == md->recursive->group_num); + + /* If the test is for recursion into a specific subpattern, and it is + false, but the test was set up by name, scan the table to see if the + name refers to any other numbers, and test them. The condition is true + if any one is set. */ + + if (!condition && condcode == OP_NRREF && recno != RREF_ANY) + { + uschar *slotA = md->name_table; + for (i = 0; i < md->name_count; i++) + { + if (GET2(slotA, 0) == recno) break; + slotA += md->name_entry_size; + } + + /* Found a name for the number - there can be only one; duplicate + names for different numbers are allowed, but not vice versa. First + scan down for duplicates. */ + + if (i < md->name_count) + { + uschar *slotB = slotA; + while (slotB > md->name_table) + { + slotB -= md->name_entry_size; + if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + { + condition = GET2(slotB, 0) == md->recursive->group_num; + if (condition) break; + } + else break; + } + + /* Scan up for duplicates */ + + if (!condition) + { + slotB = slotA; + for (i++; i < md->name_count; i++) + { + slotB += md->name_entry_size; + if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + { + condition = GET2(slotB, 0) == md->recursive->group_num; + if (condition) break; + } + else break; + } + } + } + } + + /* Chose branch according to the condition */ + + ecode += condition? 
3 : GET(ecode, 1); + } } - else if (condcode == OP_CREF) /* Group used test */ + else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ { offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; + + /* If the numbered capture is unset, but the reference was by name, + scan the table to see if the name refers to any other numbers, and test + them. The condition is true if any one is set. This is tediously similar + to the code above, but not close enough to try to amalgamate. */ + + if (!condition && condcode == OP_NCREF) + { + int refno = offset >> 1; + uschar *slotA = md->name_table; + + for (i = 0; i < md->name_count; i++) + { + if (GET2(slotA, 0) == refno) break; + slotA += md->name_entry_size; + } + + /* Found a name for the number - there can be only one; duplicate names + for different numbers are allowed, but not vice versa. First scan down + for duplicates. */ + + if (i < md->name_count) + { + uschar *slotB = slotA; + while (slotB > md->name_table) + { + slotB -= md->name_entry_size; + if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + { + offset = GET2(slotB, 0) << 1; + condition = offset < offset_top && + md->offset_vector[offset] >= 0; + if (condition) break; + } + else break; + } + + /* Scan up for duplicates */ + + if (!condition) + { + slotB = slotA; + for (i++; i < md->name_count; i++) + { + slotB += md->name_entry_size; + if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + { + offset = GET2(slotB, 0) << 1; + condition = offset < offset_top && + md->offset_vector[offset] >= 0; + if (condition) break; + } + else break; + } + } + } + } + + /* Chose branch according to the condition */ + ecode += condition? 3 : GET(ecode, 1); } @@ -897,6 +1032,30 @@ for (;;) break; + /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, + to close any currently open capturing brackets. 
*/ + + case OP_CLOSE: + number = GET2(ecode, 1); + offset = number << 1; + +#ifdef DEBUG + printf("end bracket %d at *ACCEPT", number); + printf("\n"); +#endif + + md->capture_last = number; + if (offset >= md->offset_max) md->offset_overflow = TRUE; else + { + md->offset_vector[offset] = + md->offset_vector[md->offset_end - number]; + md->offset_vector[offset+1] = eptr - md->start_subject; + if (offset_top <= offset) offset_top = offset + 2; + } + ecode += 3; + break; + + /* End of the pattern, either real or forced. If we are in a top-level recursion, we should restore the offsets appropriately and continue from after the call. */ @@ -910,16 +1069,26 @@ for (;;) md->recursive = rec->prevrec; memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); + offset_top = rec->save_offset_top; mstart = rec->save_start; ims = original_ims; ecode = rec->after_call; break; } - /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty - string - backtracking will then try other alternatives, if any. */ + /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is + set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of + the subject. In both cases, backtracking will then try other alternatives, + if any. */ + + if (eptr == mstart && + (md->notempty || + (md->notempty_atstart && + mstart == md->start_subject + md->start_offset))) + RRETURN(MATCH_NOMATCH); + + /* Otherwise, we have a match. 
*/ - if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ md->start_match_ptr = mstart; /* and the start (\K can modify) */ @@ -1010,8 +1179,9 @@ for (;;) if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); } - /* Skip to next op code */ + /* Save the earliest consulted character, then skip to next op code */ + if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; ecode += 1 + LINK_SIZE; break; @@ -1091,6 +1261,7 @@ for (;;) memcpy(new_recursive.offset_save, md->offset_vector, new_recursive.saved_max * sizeof(int)); new_recursive.save_start = mstart; + new_recursive.save_offset_top = offset_top; mstart = eptr; /* OK, now we can do the recursion. For each top-level alternative we @@ -1315,6 +1486,7 @@ for (;;) mstart = rec->save_start; memcpy(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); + offset_top = rec->save_offset_top; ecode = rec->after_call; ims = original_ims; break; @@ -1454,7 +1626,8 @@ for (;;) /* Find out if the previous and current characters are "word" characters. It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to - be "non-word" characters. */ + be "non-word" characters. Remember the earliest consulted character for + partial matching. 
*/ #ifdef SUPPORT_UTF8 if (utf8) @@ -1463,10 +1636,16 @@ for (;;) { USPTR lastptr = eptr - 1; while((*lastptr & 0xc0) == 0x80) lastptr--; + if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; GETCHAR(c, lastptr); prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } - if (eptr >= md->end_subject) cur_is_word = FALSE; else + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + cur_is_word = FALSE; + } + else { GETCHAR(c, eptr); cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; @@ -1475,13 +1654,20 @@ for (;;) else #endif - /* More streamlined when not in UTF-8 mode */ + /* Not in UTF-8 mode */ { - prev_is_word = (eptr != md->start_subject) && - ((md->ctypes[eptr[-1]] & ctype_word) != 0); - cur_is_word = (eptr < md->end_subject) && - ((md->ctypes[*eptr] & ctype_word) != 0); + if (eptr == md->start_subject) prev_is_word = FALSE; else + { + if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; + prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0); + } + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + cur_is_word = FALSE; + } + else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); } /* Now see if the situation is what we want */ @@ -1499,7 +1685,11 @@ for (;;) /* Fall through */ case OP_ALLANY: - if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr++ >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; break; @@ -1508,12 +1698,20 @@ for (;;) any byte, even newline, independent of the setting of PCRE_DOTALL. 
*/ case OP_ANYBYTE: - if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr++ >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } ecode++; break; case OP_NOT_DIGIT: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 @@ -1526,7 +1724,11 @@ for (;;) break; case OP_DIGIT: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 @@ -1539,7 +1741,11 @@ for (;;) break; case OP_NOT_WHITESPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 @@ -1552,7 +1758,11 @@ for (;;) break; case OP_WHITESPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 @@ -1565,7 +1775,11 @@ for (;;) break; case OP_NOT_WORDCHAR: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 @@ -1578,7 +1792,11 @@ for (;;) break; case OP_WORDCHAR: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 @@ -1591,7 +1809,11 @@ for (;;) break; case OP_ANYNL: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); switch(c) { @@ -1615,7 +1837,11 @@ for (;;) break; case OP_NOT_HSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= 
md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); switch(c) { @@ -1645,7 +1871,11 @@ for (;;) break; case OP_HSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); switch(c) { @@ -1675,7 +1905,11 @@ for (;;) break; case OP_NOT_VSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); switch(c) { @@ -1693,7 +1927,11 @@ for (;;) break; case OP_VSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); switch(c) { @@ -1716,7 +1954,11 @@ for (;;) case OP_PROP: case OP_NOTPROP: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); { const ucd_record *prop = GET_UCD(c); @@ -1761,7 +2003,11 @@ for (;;) is in the binary; otherwise a compile-time error occurs. 
*/ case OP_EXTUNI: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); { int category = UCD_CATEGORY(c); @@ -1841,7 +2087,11 @@ for (;;) break; default: /* No repeat follows */ - if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); + if (!match_ref(offset, eptr, length, md, ims)) + { + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } eptr += length; continue; /* With the main loop */ } @@ -1857,7 +2107,11 @@ for (;;) for (i = 1; i <= min; i++) { - if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); + if (!match_ref(offset, eptr, length, md, ims)) + { + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } eptr += length; } @@ -1874,8 +2128,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || !match_ref(offset, eptr, length, md, ims)) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (!match_ref(offset, eptr, length, md, ims)) + { + CHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } eptr += length; } /* Control never gets here */ @@ -1888,7 +2146,11 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { - if (!match_ref(offset, eptr, length, md, ims)) break; + if (!match_ref(offset, eptr, length, md, ims)) + { + CHECK_PARTIAL(); + break; + } eptr += length; } while (eptr >= pp) @@ -1902,8 +2164,6 @@ for (;;) } /* Control never gets here */ - - /* Match a bit-mapped character class, possibly repeatedly. 
This op code is used when all the characters in the class have values in the range 0-255, and either the matching is caseful, or the characters are in the range @@ -1958,7 +2218,11 @@ for (;;) { for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); if (c > 255) { @@ -1976,7 +2240,11 @@ for (;;) { for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } c = *eptr++; if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } @@ -2000,7 +2268,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); if (c > 255) { @@ -2020,7 +2293,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } c = *eptr++; if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } @@ -2041,7 +2319,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c > 255) { @@ -2067,7 +2349,11 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } c = *eptr; if ((data[c/8] & (1 << (c&7))) == 0) break; eptr++; @@ -2129,7 +2415,11 @@ for (;;) for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) 
RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } @@ -2148,7 +2438,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } @@ -2163,7 +2458,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLENTEST(c, eptr, len); if (!_pcre_xclass(c, data)) break; eptr += len; @@ -2191,7 +2490,11 @@ for (;;) length = 1; ecode++; GETCHARLEN(fc, ecode, length); - if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + if (length > md->end_subject - eptr) + { + CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ + RRETURN(MATCH_NOMATCH); + } while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); } else @@ -2199,7 +2502,11 @@ for (;;) /* Non-UTF-8 mode */ { - if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); + if (md->end_subject - eptr < 1) + { + SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ + RRETURN(MATCH_NOMATCH); + } if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); ecode += 2; } @@ -2215,7 +2522,11 @@ for (;;) ecode++; GETCHARLEN(fc, ecode, length); - if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); + if (length > md->end_subject - eptr) + { + CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ + RRETURN(MATCH_NOMATCH); + } /* If the pattern character's value is < 128, we have only one byte, and can use the fast lookup table. 
*/ @@ -2250,7 +2561,11 @@ for (;;) /* Non-UTF-8 mode */ { - if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); + if (md->end_subject - eptr < 1) + { + SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ + RRETURN(MATCH_NOMATCH); + } if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); ecode += 2; } @@ -2304,13 +2619,12 @@ for (;;) case OP_MINQUERY: c = *ecode++ - OP_STAR; minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-character matches. We can give - up quickly if there are fewer than the minimum number of characters left in - the subject. */ + /* Common code for all repeated single-character matches. */ REPEATCHAR: #ifdef SUPPORT_UTF8 @@ -2319,7 +2633,6 @@ for (;;) length = 1; charptr = ecode; GETCHARLEN(fc, ecode, length); - if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); ecode += length; /* Handle multibyte character matching specially here. 
There is @@ -2337,18 +2650,18 @@ for (;;) for (i = 1; i <= min; i++) { - if (memcmp(eptr, charptr, length) == 0) eptr += length; + if (eptr <= md->end_subject - length && + memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP - /* Need braces because of following else */ - else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } + else if (oclength > 0 && + eptr <= md->end_subject - oclength && + memcmp(eptr, occhars, oclength) == 0) eptr += oclength; +#endif /* SUPPORT_UCP */ else { - if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); - eptr += oclength; + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); } -#else /* without SUPPORT_UCP */ - else { RRETURN(MATCH_NOMATCH); } -#endif /* SUPPORT_UCP */ } if (min == max) continue; @@ -2359,19 +2672,19 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (memcmp(eptr, charptr, length) == 0) eptr += length; + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr <= md->end_subject - length && + memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP - /* Need braces because of following else */ - else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } + else if (oclength > 0 && + eptr <= md->end_subject - oclength && + memcmp(eptr, occhars, oclength) == 0) eptr += oclength; +#endif /* SUPPORT_UCP */ else { - if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); - eptr += oclength; + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); } -#else /* without SUPPORT_UCP */ - else { RRETURN (MATCH_NOMATCH); } -#endif /* SUPPORT_UCP */ } /* Control never gets here */ } @@ -2381,33 +2694,34 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { - if (eptr > md->end_subject - length) break; - if (memcmp(eptr, charptr, length) == 0) eptr += length; + if (eptr <= md->end_subject - length && + memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP - else if 
(oclength == 0) break; + else if (oclength > 0 && + eptr <= md->end_subject - oclength && + memcmp(eptr, occhars, oclength) == 0) eptr += oclength; +#endif /* SUPPORT_UCP */ else { - if (memcmp(eptr, occhars, oclength) != 0) break; - eptr += oclength; + CHECK_PARTIAL(); + break; } -#else /* without SUPPORT_UCP */ - else break; -#endif /* SUPPORT_UCP */ } if (possessive) continue; + for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) RRETURN(MATCH_NOMATCH); + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (eptr == pp) { RRETURN(MATCH_NOMATCH); } #ifdef SUPPORT_UCP - eptr--; - BACKCHAR(eptr); + eptr--; + BACKCHAR(eptr); #else /* without SUPPORT_UCP */ - eptr -= length; + eptr -= length; #endif /* SUPPORT_UCP */ - } + } } /* Control never gets here */ } @@ -2420,10 +2734,8 @@ for (;;) #endif /* SUPPORT_UTF8 */ /* When not in UTF-8 mode, load a single-byte character. */ - { - if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - fc = *ecode++; - } + + fc = *ecode++; /* The value of fc at this point is always less than 256, though we may or may not be in UTF-8 mode. 
The code is duplicated for the caseless and @@ -2441,7 +2753,14 @@ for (;;) { fc = md->lcc[fc]; for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + } if (min == max) continue; if (minimize) { @@ -2449,9 +2768,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - fc != md->lcc[*eptr++]) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2460,10 +2783,17 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { - if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (fc != md->lcc[*eptr]) break; eptr++; } + if (possessive) continue; + while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); @@ -2479,16 +2809,31 @@ for (;;) else { - for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + } + if (min == max) continue; + if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || fc != *eptr++) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + if (fc != *eptr++) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2497,10 +2842,16 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { - if (eptr >= md->end_subject || fc != *eptr) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (fc 
!= *eptr) break; eptr++; } if (possessive) continue; + while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); @@ -2516,7 +2867,11 @@ for (;;) checking can be multibyte. */ case OP_NOT: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } ecode++; GETCHARINCTEST(c, eptr); if ((ims & PCRE_CASELESS) != 0) @@ -2593,12 +2948,9 @@ for (;;) max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-byte matches. We can give up quickly - if there are fewer than the minimum number of bytes left in the - subject. */ + /* Common code for all repeated single-byte matches. */ REPEATNOTCHAR: - if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); fc = *ecode++; /* The code is duplicated for the caseless and caseful cases, for speed, @@ -2623,6 +2975,11 @@ for (;;) register unsigned int d; for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; if (fc == d) RRETURN(MATCH_NOMATCH); @@ -2634,7 +2991,14 @@ for (;;) /* Not UTF-8 mode */ { for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + } } if (min == max) continue; @@ -2650,11 +3014,15 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; if (fc == d) RRETURN(MATCH_NOMATCH); - } } else @@ -2665,8 +3033,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr 
>= md->end_subject || fc == md->lcc[*eptr++]) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2686,7 +3059,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(d, eptr, len); if (d < 256) d = md->lcc[d]; if (fc == d) break; @@ -2707,7 +3084,12 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (fc == md->lcc[*eptr]) break; eptr++; } if (possessive) continue; @@ -2735,6 +3117,11 @@ for (;;) register unsigned int d; for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(d, eptr); if (fc == d) RRETURN(MATCH_NOMATCH); } @@ -2744,7 +3131,14 @@ for (;;) /* Not UTF-8 mode */ { for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if (fc == *eptr++) RRETURN(MATCH_NOMATCH); + } } if (min == max) continue; @@ -2760,7 +3154,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(d, eptr); if (fc == d) RRETURN(MATCH_NOMATCH); } @@ -2773,8 +3172,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || fc == *eptr++) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + if (fc == *eptr++) 
RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2794,7 +3198,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(d, eptr, len); if (fc == d) break; eptr += len; @@ -2814,7 +3222,12 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || fc == *eptr) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (fc == *eptr) break; eptr++; } if (possessive) continue; @@ -2908,13 +3321,10 @@ for (;;) /* First, ensure the minimum number of matches are present. Use inline code for maximizing the speed, and do the type test once at the start - (i.e. keep it out of the loop). Also we can test that there are at least - the minimum number of bytes before we start. This isn't as effective in - UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that + (i.e. keep it out of the loop). Separate the UTF-8 code completely as that is tidier. Also separate the UCP code, which can be the same for both UTF-8 and single-bytes. 
*/ - if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); if (min > 0) { #ifdef SUPPORT_UCP @@ -2926,7 +3336,11 @@ for (;;) if (prop_fail_result) RRETURN(MATCH_NOMATCH); for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); } break; @@ -2934,7 +3348,11 @@ for (;;) case PT_LAMP: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || @@ -2947,7 +3365,11 @@ for (;;) case PT_GC: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) @@ -2958,7 +3380,11 @@ for (;;) case PT_PC: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) @@ -2969,7 +3395,11 @@ for (;;) case PT_SC: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) @@ -2989,16 +3419,19 @@ for (;;) { for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else - { - 
GETCHARLEN(c, eptr, len); - } + if (!utf8) c = *eptr; + else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; @@ -3017,8 +3450,12 @@ for (;;) case OP_ANY: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); + } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3027,20 +3464,29 @@ for (;;) case OP_ALLANY: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } break; case OP_ANYBYTE: + if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); eptr += min; break; case OP_ANYNL: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); switch(c) { @@ -3066,7 +3512,11 @@ for (;;) case OP_NOT_HSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); switch(c) { @@ -3098,7 +3548,11 @@ for (;;) case OP_HSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); switch(c) { @@ -3130,7 +3584,11 @@ for (;;) case OP_NOT_VSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); switch(c) { @@ -3150,7 +3608,11 @@ for (;;) case OP_VSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) 
RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); switch(c) { @@ -3170,7 +3632,11 @@ for (;;) case OP_NOT_DIGIT: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); @@ -3180,8 +3646,12 @@ for (;;) case OP_DIGIT: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } @@ -3190,8 +3660,12 @@ for (;;) case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } @@ -3200,8 +3674,12 @@ for (;;) case OP_WHITESPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } @@ -3220,8 +3698,12 @@ for (;;) case OP_WORDCHAR: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (*eptr >= 128 || (md->ctypes[*eptr++] & 
ctype_word) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } @@ -3235,34 +3717,49 @@ for (;;) #endif /* SUPPORT_UTF8 */ /* Code for the non-UTF-8 case for minimum matching of operators other - than OP_PROP and OP_NOTPROP. We can assume that there are the minimum - number of bytes present, as this was tested above. */ + than OP_PROP and OP_NOTPROP. */ switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; } break; case OP_ALLANY: + if (eptr > md->end_subject - min) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } eptr += min; break; case OP_ANYBYTE: + if (eptr > md->end_subject - min) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } eptr += min; break; - /* Because of the CRLF case, we can't assume the minimum number of - bytes are present in this case. */ - case OP_ANYNL: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); @@ -3284,7 +3781,11 @@ for (;;) case OP_NOT_HSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } switch(*eptr++) { default: break; @@ -3299,7 +3800,11 @@ for (;;) case OP_HSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); @@ -3314,7 +3819,11 @@ for (;;) case OP_NOT_VSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } switch(*eptr++) { default: break; @@ -3331,7 
+3840,11 @@ for (;;) case OP_VSPACE: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); @@ -3347,34 +3860,76 @@ for (;;) case OP_NOT_DIGIT: for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + } break; case OP_DIGIT: for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + } break; case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + } break; case OP_WHITESPACE: for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + } break; case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if ((md->ctypes[*eptr++] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); + } break; case OP_WORDCHAR: for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } if ((md->ctypes[*eptr++] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + } break; default: @@ -3402,7 +3957,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); if (prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ 
-3413,7 +3973,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || @@ -3428,7 +3993,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) @@ -3441,7 +4011,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) @@ -3454,7 +4029,12 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINC(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) @@ -3476,17 +4056,20 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + 
SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else - { - GETCHARLEN(c, eptr, len); - } + if (!utf8) c = *eptr; + else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; @@ -3505,10 +4088,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && IS_NEWLINE(eptr))) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (ctype == OP_ANY && IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); switch(ctype) { @@ -3664,10 +4251,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && IS_NEWLINE(eptr))) + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (ctype == OP_ANY && IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - c = *eptr++; switch(ctype) { @@ -3792,7 +4383,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (prop_fail_result) break; eptr+= len; @@ -3803,7 +4398,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || @@ -3818,7 +4417,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + 
break; + } GETCHARLEN(c, eptr, len); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) @@ -3831,7 +4434,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) @@ -3844,7 +4451,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) @@ -3873,7 +4484,11 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) break; @@ -3893,6 +4508,7 @@ for (;;) /* eptr is now past the end of the maximum run */ if (possessive) continue; + for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); @@ -3928,7 +4544,12 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3940,7 +4561,12 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3952,7 +4578,11 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3965,15 
+4595,22 @@ for (;;) case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) - c = md->end_subject - eptr; - eptr += c; + { + eptr = md->end_subject; + SCHECK_PARTIAL(); + } + else eptr += c; break; case OP_ANYNL: for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c == 0x000d) { @@ -3998,7 +4635,11 @@ for (;;) { BOOL gotspace; int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); switch(c) { @@ -4036,7 +4677,11 @@ for (;;) { BOOL gotspace; int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); switch(c) { @@ -4060,7 +4705,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; eptr+= len; @@ -4071,7 +4720,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; eptr+= len; @@ -4082,7 +4735,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; eptr+= len; @@ -4093,7 +4750,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; eptr+= len; @@ -4104,7 +4765,11 @@ for (;;) for (i = min; i 
< max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; eptr+= len; @@ -4115,7 +4780,11 @@ for (;;) for (i = min; i < max; i++) { int len = 1; - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } GETCHARLEN(c, eptr, len); if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; eptr+= len; @@ -4147,7 +4816,12 @@ for (;;) case OP_ANY: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (IS_NEWLINE(eptr)) break; eptr++; } break; @@ -4156,14 +4830,21 @@ for (;;) case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) - c = md->end_subject - eptr; - eptr += c; + { + eptr = md->end_subject; + SCHECK_PARTIAL(); + } + else eptr += c; break; case OP_ANYNL: for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } c = *eptr; if (c == 0x000d) { @@ -4184,7 +4865,11 @@ for (;;) case OP_NOT_HSPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } c = *eptr; if (c == 0x09 || c == 0x20 || c == 0xa0) break; eptr++; @@ -4194,7 +4879,11 @@ for (;;) case OP_HSPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } c = *eptr; if (c != 0x09 && c != 0x20 && c != 0xa0) break; eptr++; @@ -4204,7 +4893,11 @@ for (;;) case OP_NOT_VSPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } c = *eptr; if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) break; @@ -4215,7 +4908,11 @@ for (;;) case OP_VSPACE: 
for (i = min; i < max; i++) { - if (eptr >= md->end_subject) break; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } c = *eptr; if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) break; @@ -4226,8 +4923,12 @@ for (;;) case OP_NOT_DIGIT: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; + } + if ((md->ctypes[*eptr] & ctype_digit) != 0) break; eptr++; } break; @@ -4235,8 +4936,12 @@ for (;;) case OP_DIGIT: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; + } + if ((md->ctypes[*eptr] & ctype_digit) == 0) break; eptr++; } break; @@ -4244,8 +4949,12 @@ for (;;) case OP_NOT_WHITESPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; + } + if ((md->ctypes[*eptr] & ctype_space) != 0) break; eptr++; } break; @@ -4253,8 +4962,12 @@ for (;;) case OP_WHITESPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; + } + if ((md->ctypes[*eptr] & ctype_space) == 0) break; eptr++; } break; @@ -4262,8 +4975,12 @@ for (;;) case OP_NOT_WORDCHAR: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; + } + if ((md->ctypes[*eptr] & ctype_word) != 0) break; eptr++; } break; @@ -4271,8 +4988,12 @@ for (;;) case OP_WORDCHAR: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); break; + } + if ((md->ctypes[*eptr] & ctype_word) == 0) break; eptr++; } break; @@ -4450,6 +5171,7 @@ const uschar 
*tables; const uschar *start_bits = NULL; USPTR start_match = (USPTR)subject + start_offset; USPTR end_subject; +USPTR start_partial = NULL; USPTR req_byte_ptr = start_match - 1; pcre_study_data internal_study; @@ -4466,6 +5188,13 @@ if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; +/* This information is for finding all the numbers associated with a given +name, for condition testing. */ + +md->name_table = (uschar *)re + re->name_table_offset; +md->name_count = re->name_count; +md->name_entry_size = re->name_entry_size; + /* Fish out the optional data from the extra_data structure, first setting the default values. */ @@ -4533,7 +5262,9 @@ md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; md->notempty = (options & PCRE_NOTEMPTY) != 0; -md->partial = (options & PCRE_PARTIAL) != 0; +md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; +md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : + ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; md->hitend = FALSE; md->recursive = NULL; /* No recursion at top level */ @@ -4607,8 +5338,9 @@ else } } -/* Partial matching is supported only for a restricted set of regexes at the -moment. */ +/* Partial matching was originally supported only for a restricted set of +regexes; from release 8.00 there are no restrictions, but the bits are still +defined (though never set). So there's no harm in leaving this code. 
*/ if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) return PCRE_ERROR_BADPARTIAL; @@ -4695,7 +5427,7 @@ if (!anchored) } else if (!startline && study != NULL && - (study->options & PCRE_STUDY_MAPPED) != 0) + (study->flags & PCRE_STUDY_MAPPED) != 0) start_bits = study->start_bits; } @@ -4822,79 +5554,94 @@ for(;;) end_subject = save_end_subject; + /* The following two optimizations are disabled for partial matching or if + disabling is explicitly requested. */ + + if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) + { + /* If the pattern was studied, a minimum subject length may be set. This is + a lower bound; no actual string of that length may actually match the + pattern. Although the value is, strictly, in characters, we treat it as + bytes to avoid spending too much time in this optimization. */ + + if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && + end_subject - start_match < study->minlength) + { + rc = MATCH_NOMATCH; + break; + } + + /* If req_byte is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_byte + must be later in the subject; otherwise the test starts at the match point. + This optimization can save a huge amount of backtracking in patterns with + nested unlimited repeats that aren't going to match. Writing separate code + for cased/caseless versions makes it go faster, as does using an + autoincrement and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary patterns. + This showed up when somebody was matching something like /^\d+C/ on a + 32-megabyte string... so we don't do this when the string is sufficiently + long. */ + + if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) + { + register USPTR p = start_match + ((first_byte >= 0)? 
1 : 0); + + /* We don't need to repeat the search if we haven't yet reached the + place we found it at last time. */ + + if (p > req_byte_ptr) + { + if (req_byte_caseless) + { + while (p < end_subject) + { + register int pp = *p++; + if (pp == req_byte || pp == req_byte2) { p--; break; } + } + } + else + { + while (p < end_subject) + { + if (*p++ == req_byte) { p--; break; } + } + } + + /* If we can't find the required character, break the matching loop, + forcing a match failure. */ + + if (p >= end_subject) + { + rc = MATCH_NOMATCH; + break; + } + + /* If we have found the required character, save the point where we + found it, so that we don't search again next time round the loop if + the start hasn't passed this character yet. */ + + req_byte_ptr = p; + } + } + } + #ifdef DEBUG /* Sigh. Some compilers never learn. */ printf(">>>> Match against: "); pchars(start_match, end_subject - start_match, TRUE, md); printf("\n"); #endif - /* If req_byte is set, we know that that character must appear in the - subject for the match to succeed. If the first character is set, req_byte - must be later in the subject; otherwise the test starts at the match point. - This optimization can save a huge amount of backtracking in patterns with - nested unlimited repeats that aren't going to match. Writing separate code - for cased/caseless versions makes it go faster, as does using an - autoincrement and backing off on a match. - - HOWEVER: when the subject string is very, very long, searching to its end - can take a long time, and give bad performance on quite ordinary patterns. - This showed up when somebody was matching something like /^\d+C/ on a - 32-megabyte string... so we don't do this when the string is sufficiently - long. - - ALSO: this processing is disabled when partial matching is requested, or if - disabling is explicitly requested. 
*/ - - if ((options & PCRE_NO_START_OPTIMIZE) == 0 && - req_byte >= 0 && - end_subject - start_match < REQ_BYTE_MAX && - !md->partial) - { - register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); - - /* We don't need to repeat the search if we haven't yet reached the - place we found it at last time. */ - - if (p > req_byte_ptr) - { - if (req_byte_caseless) - { - while (p < end_subject) - { - register int pp = *p++; - if (pp == req_byte || pp == req_byte2) { p--; break; } - } - } - else - { - while (p < end_subject) - { - if (*p++ == req_byte) { p--; break; } - } - } - - /* If we can't find the required character, break the matching loop, - forcing a match failure. */ - - if (p >= end_subject) - { - rc = MATCH_NOMATCH; - break; - } - - /* If we have found the required character, save the point where we - found it, so that we don't search again next time round the loop if - the start hasn't passed this character yet. */ - - req_byte_ptr = p; - } - } - - /* OK, we can now run the match. */ + /* OK, we can now run the match. If "hitend" is set afterwards, remember the + first starting point for which a partial match was found. */ md->start_match_ptr = start_match; + md->start_used_ptr = start_match; md->match_call_count = 0; rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0); + if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; switch(rc) { @@ -4924,7 +5671,7 @@ for(;;) rc = MATCH_NOMATCH; goto ENDLOOP; - /* Any other return is some kind of error. */ + /* Any other return is either a match, or some kind of error. 
*/ default: goto ENDLOOP; @@ -5030,14 +5777,19 @@ if (using_temporary_offsets) (pcre_free)(md->offset_vector); } -if (rc != MATCH_NOMATCH) +if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) { DPRINTF((">>>> error: returning %d\n", rc)); return rc; } -else if (md->partial && md->hitend) +else if (start_partial != NULL) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); + if (offsetcount > 1) + { + offsets[0] = start_partial - (USPTR)subject; + offsets[1] = end_subject - (USPTR)subject; + } return PCRE_ERROR_PARTIAL; } else diff --git a/harbour/external/pcre/pcrefinf.c b/harbour/external/pcre/pcrefinf.c index d82870b459..9460c38a42 100644 --- a/harbour/external/pcre/pcrefinf.c +++ b/harbour/external/pcre/pcrefinf.c @@ -119,10 +119,16 @@ switch (what) case PCRE_INFO_FIRSTTABLE: *((const uschar **)where) = - (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? + (study != NULL && (study->flags & PCRE_STUDY_MAPPED) != 0)? ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; break; + case PCRE_INFO_MINLENGTH: + *((int *)where) = + (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)? + study->minlength : -1; + break; + case PCRE_INFO_LASTLITERAL: *((int *)where) = ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1; @@ -144,6 +150,9 @@ switch (what) *((const uschar **)where) = (const uschar *)(_pcre_default_tables); break; + /* From release 8.00 this will always return TRUE because NOPARTIAL is + no longer ever set (the restrictions have been removed). */ + case PCRE_INFO_OKPARTIAL: *((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0; break; diff --git a/harbour/external/pcre/pcreinal.h b/harbour/external/pcre/pcreinal.h index 8bba4acf91..5fbc5ca898 100644 --- a/harbour/external/pcre/pcreinal.h +++ b/harbour/external/pcre/pcreinal.h @@ -535,7 +535,9 @@ Standard C system should have one. */ /* Private flags containing information about the compiled regex. 
They used to live at the top end of the options word, but that got almost full, so now they -are in a 16-bit flags word. */ +are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as +the restrictions on partial matching have been lifted. It remains for backwards +compatibility. */ #define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ #define PCRE_FIRSTSET 0x0002 /* first_byte is set */ @@ -547,6 +549,7 @@ are in a 16-bit flags word. */ /* Options for the "extra" block produced by pcre_study(). */ #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ +#define PCRE_STUDY_MINLEN 0x02 /* a minimum length field exists */ /* Masks for identifying the public options that are permitted at compile time, run time, or study time, respectively. */ @@ -562,14 +565,15 @@ time, run time, or study time, respectively. */ PCRE_JAVASCRIPT_COMPAT) #define PUBLIC_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ - PCRE_NO_START_OPTIMIZE) + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ + PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \ + PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE) #define PUBLIC_DFA_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ - PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE) + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ + PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \ + PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ + PCRE_NO_START_OPTIMIZE) #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ @@ -598,7 +602,6 @@ variable-length repeat, or a anything other than literal characters. 
*/ environments where these macros are defined elsewhere. Unfortunately, there is no way to do the same for the typedef. */ - #ifndef FALSE typedef int BOOL; #define FALSE 0 @@ -1206,8 +1209,8 @@ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, OP_EOD must correspond in order to the list of escapes immediately above. *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions -that follow must also be updated to match. There is also a table called -"coptable" in pcre_dfa_exec.c that must be updated. */ +that follow must also be updated to match. There are also tables called +"coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */ enum { OP_END, /* 0 End of pattern */ @@ -1343,30 +1346,39 @@ enum { OP_SCBRA, /* 98 Start of capturing bracket, check empty */ OP_SCOND, /* 99 Conditional group, check empty */ - OP_CREF, /* 100 Used to hold a capture number as condition */ - OP_RREF, /* 101 Used to hold a recursion number as condition */ - OP_DEF, /* 102 The DEFINE condition */ + /* The next two pairs must (respectively) be kept together. */ - OP_BRAZERO, /* 103 These two must remain together and in this */ - OP_BRAMINZERO, /* 104 order. */ + OP_CREF, /* 100 Used to hold a capture number as condition */ + OP_NCREF, /* 101 Same, but generated by a name reference*/ + OP_RREF, /* 102 Used to hold a recursion number as condition */ + OP_NRREF, /* 103 Same, but generated by a name reference*/ + OP_DEF, /* 104 The DEFINE condition */ + + OP_BRAZERO, /* 105 These two must remain together and in this */ + OP_BRAMINZERO, /* 106 order. 
*/ /* These are backtracking control verbs */ - OP_PRUNE, /* 105 */ - OP_SKIP, /* 106 */ - OP_THEN, /* 107 */ - OP_COMMIT, /* 108 */ + OP_PRUNE, /* 107 */ + OP_SKIP, /* 108 */ + OP_THEN, /* 109 */ + OP_COMMIT, /* 110 */ /* These are forced failure and success verbs */ - OP_FAIL, /* 109 */ - OP_ACCEPT, /* 110 */ + OP_FAIL, /* 111 */ + OP_ACCEPT, /* 112 */ + OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO /* 111 */ + OP_SKIPZERO /* 114 */ }; +/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro +definitions that follow must also be updated to match. There are also tables +called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */ + /* This macro defines textual names for all the opcodes. These are used only for debugging. The macro is referenced only in pcre_printint.c. */ @@ -1388,9 +1400,10 @@ for debugging. The macro is referenced only in pcre_printint.c. */ "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ "AssertB", "AssertB not", "Reverse", \ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ - "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ + "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \ + "Brazero", "Braminzero", \ "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ - "Skip zero" + "Close", "Skip zero" /* This macro defines the length of fixed length operations in the compiled @@ -1450,15 +1463,16 @@ in UTF-8 mode. The code that uses this table must know about such things. 
*/ 1+LINK_SIZE, /* SBRA */ \ 3+LINK_SIZE, /* SCBRA */ \ 1+LINK_SIZE, /* SCOND */ \ - 3, /* CREF */ \ - 3, /* RREF */ \ + 3, 3, /* CREF, NCREF */ \ + 3, 3, /* RREF, NRREF */ \ 1, /* DEF */ \ 1, 1, /* BRAZERO, BRAMINZERO */ \ 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ - 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */ + 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ -/* A magic value for OP_RREF to indicate the "any recursion" condition. */ +/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion" +condition. */ #define RREF_ANY 0xffff @@ -1471,7 +1485,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, - ERR60, ERR61, ERR62, ERR63, ERR64 }; + ERR60, ERR61, ERR62, ERR63, ERR64, ERR65 }; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. We store an explicit @@ -1487,7 +1501,7 @@ Because people can now save and re-use compiled patterns, any additions to this structure should be made at the end, and something earlier (e.g. a new flag in the options or one of the dummy fields) should indicate that the new fields are present. Currently PCRE always sets the dummy fields to zero. -NOTE NOTE NOTE: +NOTE NOTE NOTE */ typedef struct real_pcre { @@ -1514,10 +1528,20 @@ remark (see NOTE above) about extending this structure applies. 
*/ typedef struct pcre_study_data { pcre_uint32 size; /* Total that was malloced */ - pcre_uint32 options; - uschar start_bits[32]; + pcre_uint32 flags; /* Private flags */ + uschar start_bits[32]; /* Starting char bits */ + pcre_uint32 minlength; /* Minimum subject length */ } pcre_study_data; +/* Structure for building a chain of open capturing subpatterns during +compiling, so that instructions to close them can be compiled when (*ACCEPT) is +encountered. */ + +typedef struct open_capitem { + struct open_capitem *next; /* Chain link */ + pcre_uint16 number; /* Capture number */ +} open_capitem; + /* Structure for passing "static" information around between the functions doing the compiling, so that they are thread-safe. */ @@ -1530,6 +1554,7 @@ typedef struct compile_data { const uschar *start_code; /* The start of the compiled code */ const uschar *start_pattern; /* The start of the pattern */ const uschar *end_pattern; /* The end of the pattern */ + open_capitem *open_caps; /* Chain of open capture items */ uschar *hwm; /* High watermark of workspace */ uschar *name_table; /* The name/number table */ int names_found; /* Number of entries so far */ @@ -1542,6 +1567,7 @@ typedef struct compile_data { int external_flags; /* External flag bits to be set */ int req_varyopt; /* "After variable item" flag for reqbyte */ BOOL had_accept; /* (*ACCEPT) encountered */ + BOOL check_lookbehind; /* Lookbehinds need later checking */ int nltype; /* Newline type */ int nllen; /* Newline string length */ uschar nl[4]; /* Newline string when fixed length */ @@ -1565,6 +1591,7 @@ typedef struct recursion_info { USPTR save_start; /* Old value of mstart */ int *offset_save; /* Pointer to start of saved offsets */ int saved_max; /* Number of saved offsets */ + int save_offset_top; /* Current value of offset_top */ } recursion_info; /* Structure for building a chain of data for holding the values of the subject @@ -1589,6 +1616,9 @@ typedef struct match_data { int offset_max; /* The 
maximum usable for return data */ int nltype; /* Newline type */ int nllen; /* Newline string length */ + int name_count; /* Number of names in name table */ + int name_entry_size; /* Size of entry in names table */ + uschar *name_table; /* Table of names */ uschar nl[4]; /* Newline string when fixed */ const uschar *lcc; /* Points to lower casing table */ const uschar *ctypes; /* Points to table of type maps */ @@ -1599,7 +1629,7 @@ typedef struct match_data { BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ BOOL endonly; /* Dollar not before final \n */ BOOL notempty; /* Empty string match not wanted */ - BOOL partial; /* PARTIAL flag */ + BOOL notempty_atstart; /* Empty string match at start not wanted */ BOOL hitend; /* Hit the end of the subject at some point */ BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ const uschar *start_code; /* For use when recursing */ @@ -1607,6 +1637,8 @@ typedef struct match_data { USPTR end_subject; /* End of the subject string */ USPTR start_match_ptr; /* Start of matched string */ USPTR end_match_ptr; /* Subject position at end match */ + USPTR start_used_ptr; /* Earliest consulted character */ + int partial; /* PARTIAL options */ int end_offset_top; /* Highwater mark at end of match */ int capture_last; /* Most recent capture number */ int start_offset; /* The start offset value */ @@ -1623,7 +1655,9 @@ typedef struct dfa_match_data { const uschar *start_code; /* Start of the compiled pattern */ const uschar *start_subject; /* Start of the subject string */ const uschar *end_subject; /* End of subject string */ + const uschar *start_used_ptr; /* Earliest consulted character */ const uschar *tables; /* Character tables */ + int start_offset; /* The start offset value */ int moptions; /* Match options */ int poptions; /* Pattern options */ int nltype; /* Newline type */ @@ -1702,15 +1736,16 @@ extern const uschar _pcre_OP_lengths[]; one of the exported public functions. 
They have to be "external" in the C sense, but are not part of the PCRE public API. */ -extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, - int *, BOOL); -extern int _pcre_ord2utf8(int, uschar *); -extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, - const pcre_study_data *, pcre_study_data *); -extern int _pcre_valid_utf8(const uschar *, int); -extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, - int *, BOOL); -extern BOOL _pcre_xclass(int, const uschar *); +extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int); +extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, + int *, BOOL); +extern int _pcre_ord2utf8(int, uschar *); +extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, + const pcre_study_data *, pcre_study_data *); +extern int _pcre_valid_utf8(const uschar *, int); +extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, + int *, BOOL); +extern BOOL _pcre_xclass(int, const uschar *); /* Unicode character database (UCD) */ diff --git a/harbour/external/pcre/pcreprni.h b/harbour/external/pcre/pcreprni.h index 5f45fc1985..acfc4ca688 100644 --- a/harbour/external/pcre/pcreprni.h +++ b/harbour/external/pcre/pcreprni.h @@ -246,7 +246,12 @@ for(;;) fprintf(f, "%s", OP_names[*code]); break; + case OP_CLOSE: + fprintf(f, " %s %d", OP_names[*code], GET2(code, 1)); + break; + case OP_CREF: + case OP_NCREF: fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); break; @@ -258,6 +263,14 @@ for(;;) fprintf(f, " Cond recurse %d", c); break; + case OP_NRREF: + c = GET2(code, 1); + if (c == RREF_ANY) + fprintf(f, " Cond nrecurse any"); + else + fprintf(f, " Cond nrecurse %d", c); + break; + case OP_DEF: fprintf(f, " Cond def"); break; diff --git a/harbour/external/pcre/pcrestud.c b/harbour/external/pcre/pcrestud.c index 4855ee20f5..86f763dfe5 100644 --- a/harbour/external/pcre/pcrestud.c +++ b/harbour/external/pcre/pcrestud.c @@ -6,7 +6,7 @@ and semantics are as close as 
possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -54,6 +54,364 @@ supporting functions. */ enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; + +/************************************************* +* Find the minimum subject length for a group * +*************************************************/ + +/* Scan a parenthesized group and compute the minimum length of subject that +is needed to match it. This is a lower bound; it does not mean there is a +string of that length that matches. In UTF8 mode, the result is in characters +rather than bytes. + +Arguments: + code pointer to start of group (the bracket) + startcode pointer to start of the whole pattern + options the compiling options + +Returns: the minimum length + -1 if \C was encountered + -2 internal error (missing capturing bracket) +*/ + +static int +find_minlength(const uschar *code, const uschar *startcode, int options) +{ +int length = -1; +BOOL utf8 = (options & PCRE_UTF8) != 0; +BOOL had_recurse = FALSE; +register int branchlength = 0; +register uschar *cc = (uschar *)code + 1 + LINK_SIZE; + +if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2; + +/* Scan along the opcodes for this branch. If we get to the end of the +branch, check the length against that of the other branches. */ + +for (;;) + { + int d, min; + uschar *cs, *ce; + register int op = *cc; + + switch (op) + { + case OP_CBRA: + case OP_SCBRA: + case OP_BRA: + case OP_SBRA: + case OP_ONCE: + case OP_COND: + case OP_SCOND: + d = find_minlength(cc, startcode, options); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* Reached end of a branch; if it's a ket it is the end of a nested + call. 
If it's ALT it is an alternation in a nested call. If it is + END it's the end of the outer call. All can be handled by the same code. */ + + case OP_ALT: + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_END: + if (length < 0 || (!had_recurse && branchlength < length)) + length = branchlength; + if (*cc != OP_ALT) return length; + cc += 1 + LINK_SIZE; + branchlength = 0; + had_recurse = FALSE; + break; + + /* Skip over assertive subpatterns */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do cc += GET(cc, 1); while (*cc == OP_ALT); + /* Fall through */ + + /* Skip over things that don't match chars */ + + case OP_REVERSE: + case OP_CREF: + case OP_NCREF: + case OP_RREF: + case OP_NRREF: + case OP_DEF: + case OP_OPT: + case OP_CALLOUT: + case OP_SOD: + case OP_SOM: + case OP_EOD: + case OP_EODN: + case OP_CIRC: + case OP_DOLL: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + cc += _pcre_OP_lengths[*cc]; + break; + + /* Skip over a subpattern that has a {0} or {0,x} quantifier */ + + case OP_BRAZERO: + case OP_BRAMINZERO: + case OP_SKIPZERO: + cc += _pcre_OP_lengths[*cc]; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* Handle literal characters and + repetitions */ + + case OP_CHAR: + case OP_CHARNC: + case OP_NOT: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSPLUS: + branchlength++; + cc += 2; +#ifdef SUPPORT_UTF8 + if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#endif + break; + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + branchlength++; + cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; + break; + + /* Handle exact repetitions. The count is already in characters, but we + need to skip over a multibyte character in UTF8 mode. 
*/ + + case OP_EXACT: + case OP_NOTEXACT: + branchlength += GET2(cc,1); + cc += 4; +#ifdef SUPPORT_UTF8 + if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#endif + break; + + case OP_TYPEEXACT: + branchlength += GET2(cc,1); + cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4; + break; + + /* Handle single-char non-literal matchers */ + + case OP_PROP: + case OP_NOTPROP: + cc += 2; + /* Fall through */ + + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + case OP_ALLANY: + case OP_EXTUNI: + case OP_HSPACE: + case OP_NOT_HSPACE: + case OP_VSPACE: + case OP_NOT_VSPACE: + branchlength++; + cc++; + break; + + /* "Any newline" might match two characters */ + + case OP_ANYNL: + branchlength += 2; + cc++; + break; + + /* The single-byte matcher means we can't proceed in UTF-8 mode */ + + case OP_ANYBYTE: +#ifdef SUPPORT_UTF8 + if (utf8) return -1; +#endif + branchlength++; + cc++; + break; + + /* For repeated character types, we have to test for \p and \P, which have + an extra two bytes of parameters. 
*/ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSQUERY: + if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; + cc += _pcre_OP_lengths[op]; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; + cc += _pcre_OP_lengths[op]; + break; + + /* Check a class for variable quantification */ + +#ifdef SUPPORT_UTF8 + case OP_XCLASS: + cc += GET(cc, 1) - 33; + /* Fall through */ +#endif + + case OP_CLASS: + case OP_NCLASS: + cc += 33; + + switch (*cc) + { + case OP_CRPLUS: + case OP_CRMINPLUS: + branchlength++; + /* Fall through */ + + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + branchlength += GET2(cc,1); + cc += 5; + break; + + default: + branchlength++; + break; + } + break; + + /* Backreferences and subroutine calls are treated in the same way: we find + the minimum length for the subpattern. A recursion, however, causes + a flag to be set that causes the length of this branch to be ignored. The + logic is that a recursion can only make sense if there is another + alternation that stops the recursing. That will provide the minimum length + (when no recursion happens). A backreference within the group that it is + referencing behaves in the same way. + + If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket + matches an empty string (by default it causes a matching failure), so in + that case we must set the minimum length to zero. 
*/ + + case OP_REF: + if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) + { + ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1)); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) + { + d = 0; + had_recurse = TRUE; + } + else d = find_minlength(cs, startcode, options); + } + else d = 0; + cc += 3; + + /* Handle repeated back references */ + + switch (*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + min = 0; + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + min = GET2(cc, 1); + cc += 5; + break; + + default: + min = 1; + break; + } + + branchlength += min * d; + break; + + case OP_RECURSE: + cs = ce = (uschar *)startcode + GET(cc, 1); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) + had_recurse = TRUE; + else + branchlength += find_minlength(cs, startcode, options); + cc += 1 + LINK_SIZE; + break; + + /* Anything else does not or need not match a character. We can get the + item's length from the table, but for those that can match zero occurrences + of a character, we must take special action for UTF-8 characters. */ + + case OP_UPTO: + case OP_NOTUPTO: + case OP_MINUPTO: + case OP_NOTMINUPTO: + case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: + case OP_NOTMINSTAR: + case OP_POSSTAR: + case OP_NOTPOSSTAR: + case OP_QUERY: + case OP_MINQUERY: + case OP_NOTMINQUERY: + case OP_POSQUERY: + case OP_NOTPOSQUERY: + cc += _pcre_OP_lengths[op]; +#ifdef SUPPORT_UTF8 + if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#endif + break; + + /* For the record, these are the opcodes that are matched by "default": + OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP, + OP_THEN. 
*/ + + default: + cc += _pcre_OP_lengths[op]; + break; + } + } +/* Control never gets here */ +} + + + /************************************************* * Set a bit and maybe its alternate case * *************************************************/ @@ -500,13 +858,15 @@ Arguments: set NULL unless error Returns: pointer to a pcre_extra block, with study_data filled in and the - appropriate flag set; + appropriate flags set; NULL on error or if no optimization possible */ PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION pcre_study(const pcre *external_re, int options, const char **errorptr) { +int min; +BOOL bits_set = FALSE; uschar start_bits[32]; pcre_extra *extra; pcre_study_data *study; @@ -533,30 +893,39 @@ code = (uschar *)re + re->name_table_offset + (re->name_count * re->name_entry_size); /* For an anchored pattern, or an unanchored pattern that has a first char, or -a multiline pattern that matches only at "line starts", no further processing -at present. */ +a multiline pattern that matches only at "line starts", there is no point in +seeking a list of starting bytes. 
*/ -if ((re->options & PCRE_ANCHORED) != 0 || - (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) - return NULL; +if ((re->options & PCRE_ANCHORED) == 0 && + (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) + { + /* Set the character tables in the block that is passed around */ -/* Set the character tables in the block that is passed around */ + tables = re->tables; + if (tables == NULL) + (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); -tables = re->tables; -if (tables == NULL) - (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, - (void *)(&tables)); + compile_block.lcc = tables + lcc_offset; + compile_block.fcc = tables + fcc_offset; + compile_block.cbits = tables + cbits_offset; + compile_block.ctypes = tables + ctypes_offset; -compile_block.lcc = tables + lcc_offset; -compile_block.fcc = tables + fcc_offset; -compile_block.cbits = tables + cbits_offset; -compile_block.ctypes = tables + ctypes_offset; + /* See if we can find a fixed set of initial characters for the pattern. */ -/* See if we can find a fixed set of initial characters for the pattern. */ + memset(start_bits, 0, 32 * sizeof(uschar)); + bits_set = set_start_bits(code, start_bits, + (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0, + &compile_block) == SSB_DONE; + } -memset(start_bits, 0, 32 * sizeof(uschar)); -if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, - (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; +/* Find the minimum length of subject string. */ + +min = find_minlength(code, code, re->options); + +/* Return NULL if no optimization is possible. */ + +if (!bits_set && min < 0) return NULL; /* Get a pcre_extra block and a pcre_study_data block. 
The study data is put in the latter, which is pointed to by the former, which may also get additional @@ -579,8 +948,19 @@ extra->flags = PCRE_EXTRA_STUDY_DATA; extra->study_data = study; study->size = sizeof(pcre_study_data); -study->options = PCRE_STUDY_MAPPED; -memcpy(study->start_bits, start_bits, sizeof(start_bits)); +study->flags = 0; + +if (bits_set) + { + study->flags |= PCRE_STUDY_MAPPED; + memcpy(study->start_bits, start_bits, sizeof(start_bits)); + } + +if (min >= 0) + { + study->flags |= PCRE_STUDY_MINLEN; + study->minlength = min; + } return extra; } diff --git a/harbour/external/pcre/pcretryf.c b/harbour/external/pcre/pcretryf.c index 4da2a1ad66..66bf5c78a7 100644 --- a/harbour/external/pcre/pcretryf.c +++ b/harbour/external/pcre/pcretryf.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -128,7 +128,9 @@ if (study != NULL) { *internal_study = *study; /* To copy other fields */ internal_study->size = byteflip(study->size, sizeof(study->size)); - internal_study->options = byteflip(study->options, sizeof(study->options)); + internal_study->flags = byteflip(study->flags, sizeof(study->flags)); + internal_study->minlength = byteflip(study->minlength, + sizeof(study->minlength)); } return internal_re; diff --git a/harbour/external/pcre/pcreucd.c b/harbour/external/pcre/pcreucd.c index 910d6c717c..f3b977a330 100644 --- a/harbour/external/pcre/pcreucd.c +++ b/harbour/external/pcre/pcreucd.c @@ -1,11 +1,28 @@ #ifdef HAVE_CONFIG_H #include "config.h" #endif + #include "pcreinal.h" /* Unicode character database. */ /* This file was autogenerated by the MultiStage2.py script. */ /* Total size: 52808 bytes, block size: 128. 
*/ + +/* The tables herein are needed only when UCP support is built */ +/* into PCRE. This module should not be referenced otherwise, so */ +/* it should not matter whether it is compiled or not. However */ +/* a comment was received about space saving - maybe the guy linked */ +/* all the modules rather than using a library - so we include a */ +/* condition to cut out the tables when not needed. But don't leave */ +/* a totally empty module because some compilers barf at that. */ +/* Instead, just supply small dummy tables. */ + +#ifndef SUPPORT_UCP +const ucd_record _pcre_ucd_records[] = {{0,0,0 }}; +const uschar _pcre_ucd_stage1[] = {0}; +const pcre_uint16 _pcre_ucd_stage2[] = {0}; +#else + /* When recompiling tables with a new Unicode version, please check types in the structure definition from pcre_internal.h: typedef struct { @@ -2608,3 +2625,4 @@ const pcre_uint16 _pcre_ucd_stage2[] = { /* 40448 bytes, block = 128 */ #if UCD_BLOCK_SIZE != 128 #error Please correct UCD_BLOCK_SIZE in pcre_internal.h #endif +#endif /* SUPPORT_UCP */