From b4a3d7bc8cd76cd731fa812abe0436ae81009b09 Mon Sep 17 00:00:00 2001 From: Viktor Szakats Date: Sat, 26 Jun 2010 13:52:28 +0000 Subject: [PATCH] 2010-06-26 15:51 UTC+0200 Viktor Szakats (harbour.01 syenar.hu) * external/pcre/pcreexec.c * external/pcre/pcre.h * external/pcre/pcreinal.h * external/pcre/pcretabs.c * external/pcre/pcredfa.c * external/pcre/pcrecomp.c * external/pcre/pcre.dif * external/pcre/config.h * external/pcre/pcrexcls.c * external/pcre/chartabs.c * external/pcre/pcreprni.h * external/pcre/pcrestud.c * external/pcre/Makefile * PCRE update to 8.10 (from 8.02) ; Thanks to Tamas Tevesz for the patch. This update was done using the new patchup tool. --- harbour/ChangeLog | 18 + harbour/external/pcre/Makefile | 4 +- harbour/external/pcre/chartabs.c | 2 +- harbour/external/pcre/config.h | 9 +- harbour/external/pcre/pcre.dif | 8 +- harbour/external/pcre/pcre.h | 9 +- harbour/external/pcre/pcrecomp.c | 750 ++++++++++++++------- harbour/external/pcre/pcredfa.c | 217 ++++-- harbour/external/pcre/pcreexec.c | 1051 +++++++++++++++++++++--------- harbour/external/pcre/pcreinal.h | 76 ++- harbour/external/pcre/pcreprni.h | 8 + harbour/external/pcre/pcrestud.c | 274 ++++++-- harbour/external/pcre/pcretabs.c | 22 +- harbour/external/pcre/pcrexcls.c | 36 +- 14 files changed, 1795 insertions(+), 689 deletions(-) diff --git a/harbour/ChangeLog b/harbour/ChangeLog index 03338f92cb..ae03155b85 100644 --- a/harbour/ChangeLog +++ b/harbour/ChangeLog @@ -16,6 +16,24 @@ The license applies to all entries newer than 2009-04-28. 
*/ +2010-06-26 15:51 UTC+0200 Viktor Szakats (harbour.01 syenar.hu) + * external/pcre/pcreexec.c + * external/pcre/pcre.h + * external/pcre/pcreinal.h + * external/pcre/pcretabs.c + * external/pcre/pcredfa.c + * external/pcre/pcrecomp.c + * external/pcre/pcre.dif + * external/pcre/config.h + * external/pcre/pcrexcls.c + * external/pcre/chartabs.c + * external/pcre/pcreprni.h + * external/pcre/pcrestud.c + * external/pcre/Makefile + * PCRE update to 8.10 (from 8.02) + ; Thanks to Tamas Tevesz for the patch. This update was done + using the new patchup tool. + 2010-06-26 15:25 UTC+0200 Viktor Szakats (harbour.01 syenar.hu) * utils/hbmk2/hbmk2.prg + -hbimplib mode now respects -clean option. diff --git a/harbour/external/pcre/Makefile b/harbour/external/pcre/Makefile index 08c58c7476..d219cfc688 100644 --- a/harbour/external/pcre/Makefile +++ b/harbour/external/pcre/Makefile @@ -68,8 +68,8 @@ else endif # ORIGIN http://www.pcre.org/ -# VER 8.02 -# URL http://sourceforge.net/projects/pcre/files/pcre/8.02/pcre-8.02.zip/download +# VER 8.10 +# URL http://sourceforge.net/projects/pcre/files/pcre/8.10/pcre-8.10.zip/download # DIFF pcre.dif # # MAP LICENCE diff --git a/harbour/external/pcre/chartabs.c b/harbour/external/pcre/chartabs.c index a0ea6e99ac..1d037e0d69 100644 --- a/harbour/external/pcre/chartabs.c +++ b/harbour/external/pcre/chartabs.c @@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the current locale. If PCRE is configured with --enable-rebuild-chartables, this happens automatically. -The following #includes are present because without the gcc 4.x may remove the +The following #includes are present because without them gcc 4.x may remove the array definition from the final binary if PCRE is built into a static library and dead code stripping is activated. This leads to link errors. 
Pulling in the header ensures that the array gets flagged as "someone outside this compilation diff --git a/harbour/external/pcre/config.h b/harbour/external/pcre/config.h index e577bbb604..460c783ba0 100644 --- a/harbour/external/pcre/config.h +++ b/harbour/external/pcre/config.h @@ -250,13 +250,16 @@ them both to 0; an emulation function will be used. */ #define PACKAGE_NAME "PCRE" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE 8.02" +#define PACKAGE_STRING "PCRE 8.10" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre" +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + /* Define to the version of this package. */ -#define PACKAGE_VERSION "8.02" +#define PACKAGE_VERSION "8.10" /* If you are compiling for a system other than a Unix-like system or @@ -312,7 +315,7 @@ them both to 0; an emulation function will be used. */ /* Version number of package */ #ifndef VERSION -#define VERSION "8.02" +#define VERSION "8.10" #endif /* Define to empty if `const' does not conform to ANSI C. 
*/ diff --git a/harbour/external/pcre/pcre.dif b/harbour/external/pcre/pcre.dif index cdc837273b..85b99e0032 100644 --- a/harbour/external/pcre/pcre.dif +++ b/harbour/external/pcre/pcre.dif @@ -1,6 +1,6 @@ diff -urN pcre.orig/pcrefinf.c pcre/pcrefinf.c ---- pcre.orig/pcrefinf.c 2010-06-13 18:04:52.000000000 +0200 -+++ pcre/pcrefinf.c 2010-06-11 04:09:57.000000000 +0200 +--- pcre.orig/pcrefinf.c 2010-06-26 14:10:17.887330037 +0200 ++++ pcre/pcrefinf.c 2010-06-26 14:10:18.107331394 +0200 @@ -126,7 +126,7 @@ case PCRE_INFO_MINLENGTH: *((int *)where) = @@ -11,8 +11,8 @@ diff -urN pcre.orig/pcrefinf.c pcre/pcrefinf.c case PCRE_INFO_LASTLITERAL: diff -urN pcre.orig/pcreglob.c pcre/pcreglob.c ---- pcre.orig/pcreglob.c 2010-06-13 18:04:52.000000000 +0200 -+++ pcre/pcreglob.c 2010-06-11 04:09:57.000000000 +0200 +--- pcre.orig/pcreglob.c 2010-06-26 14:10:17.907330491 +0200 ++++ pcre/pcreglob.c 2010-06-26 14:10:18.107331394 +0200 @@ -74,11 +74,17 @@ PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; diff --git a/harbour/external/pcre/pcre.h b/harbour/external/pcre/pcre.h index 890b13de65..febb617422 100644 --- a/harbour/external/pcre/pcre.h +++ b/harbour/external/pcre/pcre.h @@ -5,7 +5,7 @@ /* This is the public header file for the PCRE library, to be #included by applications that call the PCRE functions. - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2010 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ #define PCRE_MAJOR 8 -#define PCRE_MINOR 02 +#define PCRE_MINOR 10 #define PCRE_PRERELEASE -#define PCRE_DATE 2010-03-19 +#define PCRE_DATE 2010-06-25 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. 
When building PCRE, the appropriate @@ -131,6 +131,7 @@ both, so we keep them all distinct. */ #define PCRE_NO_START_OPTIMISE 0x04000000 #define PCRE_PARTIAL_HARD 0x08000000 #define PCRE_NOTEMPTY_ATSTART 0x10000000 +#define PCRE_UCP 0x20000000 /* Exec-time and get/set-time error codes */ @@ -200,6 +201,7 @@ these bits, just add new ones on the end, in order to remain compatible. */ #define PCRE_EXTRA_CALLOUT_DATA 0x0004 #define PCRE_EXTRA_TABLES 0x0008 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 +#define PCRE_EXTRA_MARK 0x0020 /* Types */ @@ -225,6 +227,7 @@ typedef struct pcre_extra { void *callout_data; /* Data passed back in callouts */ const unsigned char *tables; /* Pointer to character tables */ unsigned long int match_limit_recursion; /* Max recursive calls to match() */ + unsigned char **mark; /* For passing back a mark pointer */ } pcre_extra; /* The structure for passing out data via the pcre_callout_function. We use a diff --git a/harbour/external/pcre/pcrecomp.c b/harbour/external/pcre/pcrecomp.c index 7c722c4750..18e57715fa 100644 --- a/harbour/external/pcre/pcrecomp.c +++ b/harbour/external/pcre/pcrecomp.c @@ -124,7 +124,7 @@ static const short int escapes[] = { -ESC_H, 0, 0, -ESC_K, 0, 0, - 0, 0, + -ESC_N, 0, -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, @@ -171,7 +171,7 @@ static const short int escapes[] = { /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, -/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, +/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P, /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, @@ -188,11 +188,14 @@ string is built from string macros so that it works in UTF-8 mode on EBCDIC platforms. 
*/ typedef struct verbitem { - int len; - int op; + int len; /* Length of verb name */ + int op; /* Op when no arg, or -1 if arg mandatory */ + int op_arg; /* Op when arg present, or -1 if not allowed */ } verbitem; static const char verbnames[] = + "\0" /* Empty name is a shorthand for MARK */ + STRING_MARK0 STRING_ACCEPT0 STRING_COMMIT0 STRING_F0 @@ -202,13 +205,15 @@ static const char verbnames[] = STRING_THEN; static const verbitem verbs[] = { - { 6, OP_ACCEPT }, - { 6, OP_COMMIT }, - { 1, OP_FAIL }, - { 4, OP_FAIL }, - { 5, OP_PRUNE }, - { 4, OP_SKIP }, - { 4, OP_THEN } + { 0, -1, OP_MARK }, + { 4, -1, OP_MARK }, + { 6, OP_ACCEPT, -1 }, + { 6, OP_COMMIT, -1 }, + { 1, OP_FAIL, -1 }, + { 4, OP_FAIL, -1 }, + { 5, OP_PRUNE, OP_PRUNE_ARG }, + { 4, OP_SKIP, OP_SKIP_ARG }, + { 4, OP_THEN, OP_THEN_ARG } }; static const int verbcount = sizeof(verbs)/sizeof(verbitem); @@ -256,6 +261,53 @@ static const int posix_class_maps[] = { cbit_xdigit,-1, 0 /* xdigit */ }; +/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class +substitutes must be in the order of the names, defined above, and there are +both positive and negative cases. NULL means no substitute. 
*/ + +#ifdef SUPPORT_UCP +static const uschar *substitutes[] = { + (uschar *)"\\P{Nd}", /* \D */ + (uschar *)"\\p{Nd}", /* \d */ + (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */ + (uschar *)"\\p{Xsp}", /* \s */ + (uschar *)"\\P{Xwd}", /* \W */ + (uschar *)"\\p{Xwd}" /* \w */ +}; + +static const uschar *posix_substitutes[] = { + (uschar *)"\\p{L}", /* alpha */ + (uschar *)"\\p{Ll}", /* lower */ + (uschar *)"\\p{Lu}", /* upper */ + (uschar *)"\\p{Xan}", /* alnum */ + NULL, /* ascii */ + (uschar *)"\\h", /* blank */ + NULL, /* cntrl */ + (uschar *)"\\p{Nd}", /* digit */ + NULL, /* graph */ + NULL, /* print */ + NULL, /* punct */ + (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */ + (uschar *)"\\p{Xwd}", /* word */ + NULL, /* xdigit */ + /* Negated cases */ + (uschar *)"\\P{L}", /* ^alpha */ + (uschar *)"\\P{Ll}", /* ^lower */ + (uschar *)"\\P{Lu}", /* ^upper */ + (uschar *)"\\P{Xan}", /* ^alnum */ + NULL, /* ^ascii */ + (uschar *)"\\H", /* ^blank */ + NULL, /* ^cntrl */ + (uschar *)"\\P{Nd}", /* ^digit */ + NULL, /* ^graph */ + NULL, /* ^print */ + NULL, /* ^punct */ + (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */ + (uschar *)"\\P{Xwd}", /* ^word */ + NULL /* ^xdigit */ +}; +#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *)) +#endif #define STRING(a) # a #define XSTRING(s) STRING(s) @@ -319,7 +371,7 @@ static const char error_texts[] = /* 35 */ "invalid condition (?(0)\0" "\\C not allowed in lookbehind assertion\0" - "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" + "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0" "number after (?C is > 255\0" "closing ) for (?C expected\0" /* 40 */ @@ -345,7 +397,7 @@ static const char error_texts[] = "inconsistent NEWLINE options\0" "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" "a numbered reference must not be zero\0" - "(*VERB) with an argument is not supported\0" + "an argument is not allowed for 
(*ACCEPT), (*FAIL), or (*COMMIT)\0" /* 60 */ "(*VERB) not recognized\0" "number is too big\0" @@ -353,7 +405,10 @@ static const char error_texts[] = "digit expected after (?+\0" "] is an invalid data character in JavaScript compatibility mode\0" /* 65 */ - "different names for subpatterns of the same number are not allowed\0"; + "different names for subpatterns of the same number are not allowed\0" + "(*MARK) must have an argument\0" + "this version of PCRE is not compiled with PCRE_UCP support\0" + ; /* Table to identify digits and hex digits. This is used when compiling patterns. Note that the tables in chartables are dependent on the locale, and @@ -586,7 +641,6 @@ else case CHAR_l: case CHAR_L: - case CHAR_N: case CHAR_u: case CHAR_U: *errorcodeptr = ERR37; @@ -824,6 +878,19 @@ else } } +/* Perl supports \N{name} for character names, as well as plain \N for "not +newline". PCRE does not support \N{name}. */ + +if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET) + *errorcodeptr = ERR37; + +/* If PCRE_UCP is set, we change the values for \d etc. */ + +if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w) + c -= (ESC_DU - ESC_D); + +/* Set the pointer to the final character before returning. */ + *ptrptr = ptr; return c; } @@ -1062,25 +1129,39 @@ dealing with. The very first call may not start with a parenthesis. */ if (ptr[0] == CHAR_LEFT_PARENTHESIS) { - if (ptr[1] == CHAR_QUESTION_MARK && - ptr[2] == CHAR_VERTICAL_LINE) - { - ptr += 3; - dup_parens = TRUE; - } + /* Handle specials such as (*SKIP) or (*UTF8) etc. */ - /* Handle a normal, unnamed capturing parenthesis */ + if (ptr[1] == CHAR_ASTERISK) ptr += 2; - else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK) + /* Handle a normal, unnamed capturing parenthesis. */ + + else if (ptr[1] != CHAR_QUESTION_MARK) { *count += 1; if (name == NULL && *count == lorn) return *count; ptr++; } + /* All cases now have (? at the start. 
Remember when we are in a group + where the parenthesis numbers are duplicated. */ + + else if (ptr[2] == CHAR_VERTICAL_LINE) + { + ptr += 3; + dup_parens = TRUE; + } + + /* Handle comments; all characters are allowed until a ket is reached. */ + + else if (ptr[2] == CHAR_NUMBER_SIGN) + { + for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break; + goto FAIL_EXIT; + } + /* Handle a condition. If it is an assertion, just carry on so that it is processed as normal. If not, skip to the closing parenthesis of the - condition (there can't be any nested parens. */ + condition (there can't be any nested parens). */ else if (ptr[2] == CHAR_LEFT_PARENTHESIS) { @@ -1092,7 +1173,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS) } } - /* We have either (? or (* and not a condition */ + /* Start with (? but not a condition. */ else { @@ -1214,8 +1295,7 @@ for (; *ptr != 0; ptr++) else if (*ptr == CHAR_RIGHT_PARENTHESIS) { if (dup_parens && *count < hwm_count) *count = hwm_count; - *ptrptr = ptr; - return -1; + goto FAIL_EXIT; } else if (*ptr == CHAR_VERTICAL_LINE && dup_parens) @@ -1615,7 +1695,8 @@ for (;;) /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters. */ + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. */ else { @@ -1639,6 +1720,13 @@ for (;;) case OP_TYPEPOSUPTO: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; + break; } /* Add in the fixed length from the table */ @@ -1710,7 +1798,8 @@ for (;;) /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters. 
*/ + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. */ else { @@ -1734,6 +1823,13 @@ for (;;) case OP_TYPEEXACT: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; + break; } /* Add in the fixed length from the table */ @@ -2003,6 +2099,16 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE break; #endif + /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument + string. */ + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; + break; + /* None of the remaining opcodes are required to match a character. */ default: @@ -2223,8 +2329,8 @@ auto_callout(uschar *code, const uschar *ptr, compile_data *cd) { *code++ = OP_CALLOUT; *code++ = 255; -PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ -PUT(code, LINK_SIZE, 0); /* Default length */ +PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */ +PUT(code, LINK_SIZE, 0); /* Default length */ return code + 2*LINK_SIZE; } @@ -2249,7 +2355,7 @@ Returns: nothing static void complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) { -int length = ptr - cd->start_pattern - GET(previous_callout, 2); +int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2)); PUT(previous_callout, 2 + LINK_SIZE, length); } @@ -2299,6 +2405,69 @@ for (++c; c <= d; c++) return TRUE; } + + + +/************************************************* +* Check a character and a property * +*************************************************/ + +/* This function is called by check_auto_possessive() when a property item +is adjacent to a fixed character. 
+ +Arguments: + c the character + ptype the property type + pdata the data for the type + negated TRUE if it's a negated property (\P or \p{^) + +Returns: TRUE if auto-possessifying is OK +*/ + +static BOOL +check_char_prop(int c, int ptype, int pdata, BOOL negated) +{ +const ucd_record *prop = GET_UCD(c); +switch(ptype) + { + case PT_LAMP: + return (prop->chartype == ucp_Lu || + prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == negated; + + case PT_GC: + return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated; + + case PT_PC: + return (pdata == prop->chartype) == negated; + + case PT_SC: + return (pdata == prop->script) == negated; + + /* These are specials */ + + case PT_ALNUM: + return (_pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated; + + case PT_SPACE: /* Perl space */ + return (_pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == negated; + + case PT_PXSPACE: /* POSIX space */ + return (_pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) + == negated; + + case PT_WORD: + return (_pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE) == negated; + } +return FALSE; +} #endif /* SUPPORT_UCP */ @@ -2312,10 +2481,8 @@ whether the next thing could possibly match the repeated item. If not, it makes sense to automatically possessify the repeated item. Arguments: - op_code the repeated op code - this data for this item, depends on the opcode + previous pointer to the repeated opcode utf8 TRUE in UTF-8 mode - utf8_char used for utf8 character bytes, NULL if not relevant ptr next character in pattern options options bits cd contains pointers to tables etc. 
@@ -2324,10 +2491,11 @@ Returns: TRUE if possessifying is wanted */ static BOOL -check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, - const uschar *ptr, int options, compile_data *cd) +check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr, + int options, compile_data *cd) { -int next; +int c, next; +int op_code = *previous++; /* Skip whitespace and comments in extended mode */ @@ -2388,23 +2556,18 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) return FALSE; -/* Now compare the next item with the previous opcode. If the previous is a -positive single character match, "item" either contains the character or, if -"item" is greater than 127 in utf8 mode, the character's bytes are in -utf8_char. */ - - -/* Handle cases when the next item is a character. */ +/* Now compare the next item with the previous opcode. First, handle cases when +the next item is a character. */ if (next >= 0) switch(op_code) { case OP_CHAR: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + GETCHARTEST(c, previous); #else - (void)(utf8_char); /* Keep compiler happy by referencing function argument */ + c = *previous; #endif - return item != next; + return c != next; /* For CHARNC (caseless character) we must check the other case. 
If we have Unicode property support, we can use it to test the other case of @@ -2412,9 +2575,11 @@ if (next >= 0) switch(op_code) case OP_CHARNC: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + GETCHARTEST(c, previous); +#else + c = *previous; #endif - if (item == next) return FALSE; + if (c == next) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) { @@ -2425,16 +2590,16 @@ if (next >= 0) switch(op_code) #else othercase = NOTACHAR; #endif - return (unsigned int)item != othercase; + return (unsigned int)c != othercase; } else #endif /* SUPPORT_UTF8 */ - return (item != cd->fcc[next]); /* Non-UTF-8 mode */ + return (c != cd->fcc[next]); /* Non-UTF-8 mode */ - /* For OP_NOT, "item" must be a single-byte character. */ + /* For OP_NOT, its data is always a single-byte character. */ case OP_NOT: - if (item == next) return TRUE; + if ((c = *previous) == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) @@ -2446,11 +2611,14 @@ if (next >= 0) switch(op_code) #else othercase = NOTACHAR; #endif - return (unsigned int)item == othercase; + return (unsigned int)c == othercase; } else #endif /* SUPPORT_UTF8 */ - return (item == cd->fcc[next]); /* Non-UTF-8 mode */ + return (c == cd->fcc[next]); /* Non-UTF-8 mode */ + + /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. + When it is set, \d etc. are converted into OP_(NOT_)PROP codes. 
*/ case OP_DIGIT: return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; @@ -2493,11 +2661,12 @@ if (next >= 0) switch(op_code) case 0x202f: case 0x205f: case 0x3000: - return op_code != OP_HSPACE; + return op_code == OP_NOT_HSPACE; default: - return op_code == OP_HSPACE; + return op_code != OP_NOT_HSPACE; } + case OP_ANYNL: case OP_VSPACE: case OP_NOT_VSPACE: switch(next) @@ -2509,48 +2678,62 @@ if (next >= 0) switch(op_code) case 0x85: case 0x2028: case 0x2029: - return op_code != OP_VSPACE; + return op_code == OP_NOT_VSPACE; default: - return op_code == OP_VSPACE; + return op_code != OP_NOT_VSPACE; } +#ifdef SUPPORT_UCP + case OP_PROP: + return check_char_prop(next, previous[0], previous[1], FALSE); + + case OP_NOTPROP: + return check_char_prop(next, previous[0], previous[1], TRUE); +#endif + default: return FALSE; } -/* Handle the case when the next item is \d, \s, etc. */ +/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP +is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are +generated only when PCRE_UCP is *not* set, that is, when only ASCII +characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are +replaced by OP_PROP codes when PCRE_UCP is set. 
*/ switch(op_code) { case OP_CHAR: case OP_CHARNC: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + GETCHARTEST(c, previous); +#else + c = *previous; #endif switch(-next) { case ESC_d: - return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; + return c > 127 || (cd->ctypes[c] & ctype_digit) == 0; case ESC_D: - return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; + return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0; case ESC_s: - return item > 127 || (cd->ctypes[item] & ctype_space) == 0; + return c > 127 || (cd->ctypes[c] & ctype_space) == 0; case ESC_S: - return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; + return c <= 127 && (cd->ctypes[c] & ctype_space) != 0; case ESC_w: - return item > 127 || (cd->ctypes[item] & ctype_word) == 0; + return c > 127 || (cd->ctypes[c] & ctype_word) == 0; case ESC_W: - return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; + return c <= 127 && (cd->ctypes[c] & ctype_word) != 0; case ESC_h: case ESC_H: - switch(item) + switch(c) { case 0x09: case 0x20: @@ -2578,7 +2761,7 @@ switch(op_code) case ESC_v: case ESC_V: - switch(item) + switch(c) { case 0x0a: case 0x0b: @@ -2592,38 +2775,92 @@ switch(op_code) return -next == ESC_v; } + /* When PCRE_UCP is set, these values get generated for \d etc. Find + their substitutions and process them. The result will always be either + -ESC_p or -ESC_P. Then fall through to process those values. 
*/ + +#ifdef SUPPORT_UCP + case ESC_du: + case ESC_DU: + case ESC_wu: + case ESC_WU: + case ESC_su: + case ESC_SU: + { + int temperrorcode = 0; + ptr = substitutes[-next - ESC_DU]; + next = check_escape(&ptr, &temperrorcode, 0, options, FALSE); + if (temperrorcode != 0) return FALSE; + ptr++; /* For compatibility */ + } + /* Fall through */ + + case ESC_p: + case ESC_P: + { + int ptype, pdata, errorcodeptr; + BOOL negated; + + ptr--; /* Make ptr point at the p or P */ + ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr); + if (ptype < 0) return FALSE; + ptr++; /* Point past the final curly ket */ + + /* If the property item is optional, we have to give up. (When generated + from \d etc by PCRE_UCP, this test will have been applied much earlier, + to the original \d etc. At this point, ptr will point to a zero byte. */ + + if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || + strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) + return FALSE; + + /* Do the property check. */ + + return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated); + } +#endif + default: return FALSE; } + /* In principle, support for Unicode properties should be integrated here as + well. It means re-organizing the above code so as to get hold of the property + values before switching on the op-code. However, I wonder how many patterns + combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set, + these op-codes are never generated.) 
*/ + case OP_DIGIT: return next == -ESC_D || next == -ESC_s || next == -ESC_W || - next == -ESC_h || next == -ESC_v; + next == -ESC_h || next == -ESC_v || next == -ESC_R; case OP_NOT_DIGIT: return next == -ESC_d; case OP_WHITESPACE: - return next == -ESC_S || next == -ESC_d || next == -ESC_w; + return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R; case OP_NOT_WHITESPACE: return next == -ESC_s || next == -ESC_h || next == -ESC_v; case OP_HSPACE: - return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; + return next == -ESC_S || next == -ESC_H || next == -ESC_d || + next == -ESC_w || next == -ESC_v || next == -ESC_R; case OP_NOT_HSPACE: return next == -ESC_h; /* Can't have \S in here because VT matches \S (Perl anomaly) */ + case OP_ANYNL: case OP_VSPACE: return next == -ESC_V || next == -ESC_d || next == -ESC_w; case OP_NOT_VSPACE: - return next == -ESC_v; + return next == -ESC_v || next == -ESC_R; case OP_WORDCHAR: - return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; + return next == -ESC_W || next == -ESC_s || next == -ESC_h || + next == -ESC_v || next == -ESC_R; case OP_NOT_WORDCHAR: return next == -ESC_w || next == -ESC_d; @@ -2687,6 +2924,7 @@ BOOL inescq = FALSE; BOOL groupsetfirstbyte = FALSE; const uschar *ptr = *ptrptr; const uschar *tempptr; +const uschar *nestptr = NULL; uschar *previous = NULL; uschar *previous_callout = NULL; uschar *save_hwm = NULL; @@ -2757,6 +2995,16 @@ for (;; ptr++) c = *ptr; + /* If we are at the end of a nested substitution, revert to the outer level + string. Nesting only happens one level deep. */ + + if (c == 0 && nestptr != NULL) + { + ptr = nestptr; + nestptr = NULL; + c = *ptr; + } + /* If we are in the pre-compile phase, accumulate the length used for the previous cycle of this loop. 
*/ @@ -2787,7 +3035,7 @@ for (;; ptr++) goto FAILED; } - *lengthptr += code - last_code; + *lengthptr += (int)(code - last_code); DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); /* If "previous" is set and it is not at the start of the work space, move @@ -2905,7 +3153,7 @@ for (;; ptr++) *errorcodeptr = ERR20; goto FAILED; } - *lengthptr += code - last_code; /* To include callout length */ + *lengthptr += (int)(code - last_code); /* To include callout length */ DPRINTF((">> end branch\n")); } return TRUE; @@ -3110,7 +3358,7 @@ for (;; ptr++) ptr++; } - posix_class = check_posix_name(ptr, tempptr - ptr); + posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); if (posix_class < 0) { *errorcodeptr = ERR30; @@ -3124,10 +3372,25 @@ for (;; ptr++) if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; - /* We build the bit map for the POSIX class in a chunk of local store - because we may be adding and subtracting from it, and we don't want to - subtract bits that may be in the main map already. At the end we or the - result into the bit map that is being built. */ + /* When PCRE_UCP is set, some of the POSIX classes are converted to + different escape sequences that use Unicode properties. */ + +#ifdef SUPPORT_UCP + if ((options & PCRE_UCP) != 0) + { + int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); + if (posix_substitutes[pc] != NULL) + { + nestptr = tempptr + 1; + ptr = posix_substitutes[pc] - 1; + continue; + } + } +#endif + /* In the non-UCP case, we build the bit map for the POSIX class in a + chunk of local store because we may be adding and subtracting from it, + and we don't want to subtract bits that may be in the main map already. + At the end we or the result into the bit map that is being built. */ posix_class *= 3; @@ -3171,19 +3434,18 @@ for (;; ptr++) /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. 
The sequence \b is a special - case. Inside a class (and only there) it is treated as backspace. - Elsewhere it marks a word boundary. Other escapes have preset maps ready - to 'or' into the one we are building. We assume they have more than one - character in them, so set class_charcount bigger than one. */ + case. Inside a class (and only there) it is treated as backspace. We + assume that other escapes have more than one character in them, so set + class_charcount bigger than one. Unrecognized escapes fall through and + are either treated as literal characters (by default), or are faulted if + PCRE_EXTRA is set. */ if (c == CHAR_BACKSLASH) { c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ - else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */ - else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */ + if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ { if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) @@ -3200,10 +3462,20 @@ for (;; ptr++) register const uschar *cbits = cd->cbits; class_charcount += 2; /* Greater than 1 is what matters */ - /* Save time by not doing this in the pre-compile phase. */ - - if (lengthptr == NULL) switch (-c) + switch (-c) { +#ifdef SUPPORT_UCP + case ESC_du: /* These are the values given for \d etc */ + case ESC_DU: /* when PCRE_UCP is set. We replace the */ + case ESC_wu: /* escape sequence with an appropriate \p */ + case ESC_WU: /* or \P to test Unicode properties instead */ + case ESC_su: /* of the default ASCII testing. */ + case ESC_SU: + nestptr = ptr; + ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */ + class_charcount -= 2; /* Undo! 
*/ + continue; +#endif case ESC_d: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; continue; @@ -3233,20 +3505,7 @@ for (;; ptr++) classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; - default: /* Not recognized; fall through */ - break; /* Need "default" setting to stop compiler warning. */ - } - - /* In the pre-compile phase, just do the recognition. */ - - else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || - c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; - - /* We need to deal with \H, \h, \V, and \v in both phases because - they use extra memory. */ - - if (-c == ESC_h) - { + case ESC_h: SETBIT(classbits, 0x09); /* VT */ SETBIT(classbits, 0x20); /* SPACE */ SETBIT(classbits, 0xa0); /* NSBP */ @@ -3270,10 +3529,8 @@ for (;; ptr++) } #endif continue; - } - if (-c == ESC_H) - { + case ESC_H: for (c = 0; c < 32; c++) { int x = 0xff; @@ -3315,10 +3572,8 @@ for (;; ptr++) } #endif continue; - } - if (-c == ESC_v) - { + case ESC_v: SETBIT(classbits, 0x0a); /* LF */ SETBIT(classbits, 0x0b); /* VT */ SETBIT(classbits, 0x0c); /* FF */ @@ -3334,10 +3589,8 @@ for (;; ptr++) } #endif continue; - } - if (-c == ESC_V) - { + case ESC_V: for (c = 0; c < 32; c++) { int x = 0xff; @@ -3367,38 +3620,38 @@ for (;; ptr++) } #endif continue; - } - - /* We need to deal with \P and \p in both phases. */ #ifdef SUPPORT_UCP - if (-c == ESC_p || -c == ESC_P) - { - BOOL negated; - int pdata; - int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); - if (ptype < 0) goto FAILED; - class_utf8 = TRUE; - *class_utf8data++ = ((-c == ESC_p) != negated)? - XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = ptype; - *class_utf8data++ = pdata; - class_charcount -= 2; /* Not a < 256 character */ - continue; - } + case ESC_p: + case ESC_P: + { + BOOL negated; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) goto FAILED; + class_utf8 = TRUE; + *class_utf8data++ = ((-c == ESC_p) != negated)? 
+ XCL_PROP : XCL_NOTPROP; + *class_utf8data++ = ptype; + *class_utf8data++ = pdata; + class_charcount -= 2; /* Not a < 256 character */ + continue; + } #endif - /* Unrecognized escapes are faulted if PCRE is running in its - strict mode. By default, for compatibility with Perl, they are - treated as literals. */ + /* Unrecognized escapes are faulted if PCRE is running in its + strict mode. By default, for compatibility with Perl, they are + treated as literals. */ - if ((options & PCRE_EXTRA) != 0) - { - *errorcodeptr = ERR7; - goto FAILED; + default: + if ((options & PCRE_EXTRA) != 0) + { + *errorcodeptr = ERR7; + goto FAILED; + } + class_charcount -= 2; /* Undo the default count from above */ + c = *ptr; /* Get the final character and fall through */ + break; } - - class_charcount -= 2; /* Undo the default count from above */ - c = *ptr; /* Get the final character and fall through */ } /* Fall through if we have a single character (c >= 0). This may be @@ -3468,14 +3721,11 @@ for (;; ptr++) d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - /* \b is backspace; \X is literal X; \R is literal R; any other - special means the '-' was literal */ + /* \b is backspace; any other special means the '-' was literal */ if (d < 0) { - if (d == -ESC_b) d = CHAR_BS; - else if (d == -ESC_X) d = CHAR_X; - else if (d == -ESC_R) d = CHAR_R; else + if (d == -ESC_b) d = CHAR_BS; else { ptr = oldptr; goto LONE_SINGLE_CHARACTER; /* A few lines below */ @@ -3641,35 +3891,23 @@ for (;; ptr++) } } - /* Loop until ']' reached. This "while" is the end of the "do" above. */ + /* Loop until ']' reached. This "while" is the end of the "do" far above. + If we are at the end of an internal nested string, revert to the outer + string. 
*/ - while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); + while (((c = *(++ptr)) != 0 || + (nestptr != NULL && + (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) && + (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); - if (c == 0) /* Missing terminating ']' */ + /* Check for missing terminating ']' */ + + if (c == 0) { *errorcodeptr = ERR6; goto FAILED; } - -/* This code has been disabled because it would mean that \s counts as -an explicit \r or \n reference, and that's not really what is wanted. Now -we set the flag only if there is a literal "\r" or "\n" in the class. */ - -#if 0 - /* Remember whether \r or \n are in this class */ - - if (negate_class) - { - if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; - } - else - { - if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; - } -#endif - - /* If class_charcount is 1, we saw precisely one character whose value is less than 256. As long as there were no characters >= 128 and there was no use of \p or \P, in other words, no use of any XCLASS features, we can @@ -3733,13 +3971,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* If there are characters with values > 255, we have to compile an extended class, with its own opcode, unless there was a negated special - such as \S in the class, because in that case all characters > 255 are in - the class, so any that were explicitly given as well can be ignored. If - (when there are explicit characters > 255 that must be listed) there are no - characters < 256, we can omit the bitmap in the actual compiled code. */ + such as \S in the class, and PCRE_UCP is not set, because in that case all + characters > 255 are in the class, so any that were explicitly given as + well can be ignored. If (when there are explicit characters > 255 that must + be listed) there are no characters < 256, we can omit the bitmap in the + actual compiled code. 
*/ #ifdef SUPPORT_UTF8 - if (class_utf8 && !should_flip_negation) + if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0)) { *class_utf8data++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; @@ -3765,10 +4004,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } #endif - /* If there are no characters > 255, set the opcode to OP_CLASS or - OP_NCLASS, depending on whether the whole class was negated and whether - there were negative specials such as \S in the class. Then copy the 32-byte - map into the code vector, negating it if necessary. */ + /* If there are no characters > 255, or they are all to be included or + excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the + whole class was negated and whether there were negative specials such as \S + (non-UCP) in the class. Then copy the 32-byte map into the code vector, + negating it if necessary. */ *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) @@ -3892,8 +4132,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, - options, cd)) + check_auto_possessive(previous, utf8, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -3914,7 +4153,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ c = previous[1]; if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) + check_auto_possessive(previous, utf8, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -3938,7 +4177,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) + check_auto_possessive(previous, utf8, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4148,7 +4387,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { register int i; int ketoffset = 0; - int len = code - previous; + int len = (int)(code - previous); uschar *bralink = NULL; /* Repeating a DEFINE group is pointless */ @@ -4169,7 +4408,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { register uschar *ket = previous; do ket += GET(ket, 1); while (*ket != OP_KET); - ketoffset = code - ket; + ketoffset = (int)(code - ket); } /* The case of a zero minimum is special because of the need to stick @@ -4237,7 +4476,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* We chain together the bracket offset fields that have to be filled in later when the ends of the brackets are reached. */ - offset = (bralink == NULL)? 0 : previous - bralink; + offset = (bralink == NULL)? 0 : (int)(previous - bralink); bralink = previous; PUTINC(previous, 0, offset); } @@ -4346,7 +4585,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { int offset; *code++ = OP_BRA; - offset = (bralink == NULL)? 0 : code - bralink; + offset = (bralink == NULL)? 0 : (int)(code - bralink); bralink = code; PUTINC(code, 0, offset); } @@ -4367,7 +4606,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ while (bralink != NULL) { int oldlinkoffset; - int offset = code - bralink + 1; + int offset = (int)(code - bralink + 1); uschar *bra = code - offset; oldlinkoffset = GET(bra, 1); bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; @@ -4455,7 +4694,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ #endif } - len = code - tempcode; + len = (int)(code - tempcode); if (len > 0) switch (*tempcode) { case OP_STAR: *tempcode = OP_POSSTAR; break; @@ -4514,24 +4753,34 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* First deal with various "verbs" that can be introduced by '*'. */ - if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0) + if (*(++ptr) == CHAR_ASTERISK && + ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':')) { int i, namelen; + int arglen = 0; const char *vn = verbnames; - const uschar *name = ++ptr; + const uschar *name = ptr + 1; + const uschar *arg = NULL; previous = NULL; while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; + namelen = (int)(ptr - name); + if (*ptr == CHAR_COLON) { - *errorcodeptr = ERR59; /* Not supported */ - goto FAILED; + arg = ++ptr; + while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0 + || *ptr == '_') ptr++; + arglen = (int)(ptr - arg); } + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR60; goto FAILED; } - namelen = ptr - name; + + /* Scan the table of verb names */ + for (i = 0; i < verbcount; i++) { if (namelen == verbs[i].len && @@ -4549,13 +4798,41 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ PUT2INC(code, 0, oc->number); } } - *code++ = verbs[i].op; - break; + + /* Handle the cases with/without an argument */ + + if (arglen == 0) + { + if (verbs[i].op < 0) /* Argument is mandatory */ + { + *errorcodeptr = ERR66; + goto FAILED; + } + *code++ = verbs[i].op; + } + + else + { + if (verbs[i].op_arg < 0) /* Argument is forbidden */ + { + *errorcodeptr = ERR59; + goto FAILED; + } + *code++ = verbs[i].op_arg; + *code++ = arglen; + memcpy(code, arg, arglen); + code += arglen; + *code++ = 0; + } + + break; /* Found verb, exit loop */ } + vn += verbs[i].len + 1; } - if (i < verbcount) continue; - *errorcodeptr = ERR60; + + if (i < verbcount) continue; /* Successfully handled a verb */ + *errorcodeptr = ERR60; /* Verb not recognized */ goto FAILED; } @@ -4674,7 +4951,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ recno * 10 + *ptr - CHAR_0 : -1; ptr++; } - namelen = ptr - name; + namelen = (int)(ptr - name); if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != CHAR_RIGHT_PARENTHESIS) @@ -4870,8 +5147,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ goto FAILED; } *code++ = n; - PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ - PUT(code, LINK_SIZE, 0); /* Default length */ + PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ code += 2 * LINK_SIZE; } previous = NULL; @@ -4904,7 +5181,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ name = ++ptr; while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = ptr - name; + namelen = (int)(ptr - name); /* In the pre-compile phase, just do a syntax check. */ @@ -5034,7 +5311,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. 
*/ NAMED_REF_OR_RECURSE: name = ++ptr; while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = ptr - name; + namelen = (int)(ptr - name); /* In the pre-compile phase, do a syntax check and set a dummy reference number. */ @@ -5203,7 +5480,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ of the group. */ called = cd->start_code + recno; - PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); + PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code)); } /* If not a forward reference, and the subpattern is still open, @@ -5227,7 +5504,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ code += 1 + LINK_SIZE; *code = OP_RECURSE; - PUT(code, 1, called - cd->start_code); + PUT(code, 1, (int)(called - cd->start_code)); code += 1 + LINK_SIZE; *code = OP_KET; @@ -5338,8 +5615,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } /* End of switch for character following (? */ } /* End of (? handling */ - /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, - all unadorned brackets become non-capturing and behave like (?:...) + /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE + is set, all unadorned brackets become non-capturing and behave like (?:...) brackets. */ else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) @@ -5531,11 +5808,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* ===================================================================*/ /* Handle metasequences introduced by \. For ones like \d, the ESC_ values - are arranged to be the negation of the corresponding OP_values. For the - back references, the values are ESC_REF plus the reference number. Only - back references and those types that consume a character may be repeated. - We can test for values between ESC_b and ESC_Z for the latter; this may - have to change if any new ones are ever created. 
*/ + are arranged to be the negation of the corresponding OP_values in the + default case when PCRE_UCP is not set. For the back references, the values + are ESC_REF plus the reference number. Only back references and those types + that consume a character may be repeated. We can test for values between + ESC_b and ESC_Z for the latter; this may have to change if any new ones are + ever created. */ case CHAR_BACKSLASH: tempptr = ptr; @@ -5695,12 +5973,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ #endif /* For the rest (including \X when Unicode properties are supported), we - can obtain the OP value by negating the escape value. */ + can obtain the OP value by negating the escape value in the default + situation when PCRE_UCP is not set. When it *is* set, we substitute + Unicode property tests. */ else { - previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; - *code++ = -c; +#ifdef SUPPORT_UCP + if (-c >= ESC_DU && -c <= ESC_wu) + { + nestptr = ptr + 1; /* Where to resume */ + ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */ + } + else +#endif + { + previous = (-c > ESC_b && -c < ESC_Z)? 
code : NULL; + *code++ = -c; + } } continue; } @@ -6032,7 +6322,7 @@ for (;;) { if (lengthptr == NULL) { - int branch_length = code - last_branch; + int branch_length = (int)(code - last_branch); do { int prev_length = GET(last_branch, 1); @@ -6046,7 +6336,7 @@ for (;;) /* Fill in the ket */ *code = OP_KET; - PUT(code, 1, code - start_bracket); + PUT(code, 1, (int)(code - start_bracket)); code += 1 + LINK_SIZE; /* If it was a capturing subpattern, check to see if it contained any @@ -6061,9 +6351,9 @@ for (;;) code - start_bracket); *start_bracket = OP_ONCE; code += 1 + LINK_SIZE; - PUT(start_bracket, 1, code - start_bracket); + PUT(start_bracket, 1, (int)(code - start_bracket)); *code = OP_KET; - PUT(code, 1, code - start_bracket); + PUT(code, 1, (int)(code - start_bracket)); code += 1 + LINK_SIZE; length += 2 + 2*LINK_SIZE; } @@ -6118,7 +6408,7 @@ for (;;) else { *code = OP_ALT; - PUT(code, 1, code - last_branch); + PUT(code, 1, (int)(code - last_branch)); bc.current_branch = last_branch = code; code += 1 + LINK_SIZE; } @@ -6437,7 +6727,7 @@ int length = 1; /* For final END opcode */ int firstbyte, reqbyte, newline; int errorcode = 0; int skipatstart = 0; -BOOL utf8 = (options & PCRE_UTF8) != 0; +BOOL utf8; size_t size; uschar *code; const uschar *codestart; @@ -6507,6 +6797,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0) { skipatstart += 7; options |= PCRE_UTF8; continue; } + else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0) + { skipatstart += 6; options |= PCRE_UCP; continue; } if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } @@ -6531,6 +6823,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && else break; } +utf8 = (options & PCRE_UTF8) != 0; + /* Can't support UTF8 unless PCRE has been compiled to include the code. 
*/ #ifdef SUPPORT_UTF8 @@ -6548,6 +6842,16 @@ if (utf8) } #endif +/* Can't support UCP unless PCRE has been compiled to include the code. */ + +#ifndef SUPPORT_UCP +if ((options & PCRE_UCP) != 0) + { + errorcode = ERR67; + goto PCRE_EARLY_ERROR_RETURN; + } +#endif + /* Check validity of \R options. */ switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) @@ -6676,7 +6980,7 @@ regex compiled on a system with 4-byte pointers is run on another with 8-byte pointers. */ re->magic_number = MAGIC_NUMBER; -re->size = size; +re->size = (int)size; re->options = cd->external_options; re->flags = cd->external_flags; re->dummy1 = 0; @@ -6747,7 +7051,7 @@ while (errorcode == 0 && cd->hwm > cworkspace) recno = GET(codestart, offset); groupptr = _pcre_find_bracket(codestart, utf8, recno); if (groupptr == NULL) errorcode = ERR53; - else PUT(((uschar *)codestart), offset, groupptr - codestart); + else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart)); } /* Give an error if there's back reference to a non-existent capturing @@ -6802,7 +7106,7 @@ if (errorcode != 0) { (pcre_free)(re); PCRE_EARLY_ERROR_RETURN: - *erroroffset = ptr - (const uschar *)pattern; + *erroroffset = (int)(ptr - (const uschar *)pattern); PCRE_EARLY_ERROR_RETURN2: *errorptr = find_error_text(errorcode); if (errorcodeptr != NULL) *errorcodeptr = errorcode; diff --git a/harbour/external/pcre/pcredfa.c b/harbour/external/pcre/pcredfa.c index a806b00ed6..3fdb0246e2 100644 --- a/harbour/external/pcre/pcredfa.c +++ b/harbour/external/pcre/pcredfa.c @@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */ /* This table identifies those opcodes that are followed immediately by a -character that is to be tested in some way. This makes is possible to +character that is to be tested in some way. This makes it possible to centralize the loading of these characters. 
In the case of Type * etc, the "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a small value. Non-zero values in the table are the offsets from the opcode where @@ -161,8 +161,9 @@ static const uschar coptable[] = { 0, 0, /* RREF, NRREF */ 0, /* DEF */ 0, 0, /* BRAZERO, BRAMINZERO */ - 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ - 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ + 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */ + 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */ + 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */ }; /* This table identifies those opcodes that inspect a character. It is used to @@ -218,8 +219,9 @@ static const uschar poptable[] = { 0, 0, /* RREF, NRREF */ 0, /* DEF */ 0, 0, /* BRAZERO, BRAMINZERO */ - 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ - 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ + 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */ + 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */ + 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */ }; /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, @@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE) { gone_back = (current_subject - max_back < start_subject)? - current_subject - start_subject : max_back; + (int)(current_subject - start_subject) : max_back; current_subject -= gone_back; } @@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE) int back = GET(end_code, 2+LINK_SIZE); if (back <= gone_back) { - int bstate = end_code - start_code + 2 + 2*LINK_SIZE; + int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); ADD_NEW_DATA(-bstate, 0, gone_back - back); } end_code += GET(end_code, 1); @@ -526,7 +528,7 @@ else ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 
2:0); do { - ADD_NEW(end_code - start_code + length, 0); + ADD_NEW((int)(end_code - start_code + length), 0); end_code += GET(end_code, 1); length = 1 + LINK_SIZE; } @@ -753,8 +755,8 @@ for (;;) if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); if (offsetcount >= 2) { - offsets[0] = current_subject - start_subject; - offsets[1] = ptr - start_subject; + offsets[0] = (int)(current_subject - start_subject); + offsets[1] = (int)(ptr - start_subject); DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, offsets[1] - offsets[0], current_subject)); } @@ -776,7 +778,7 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_ALT: do { code += GET(code, 1); } while (*code == OP_ALT); - ADD_ACTIVE(code - start_code, 0); + ADD_ACTIVE((int)(code - start_code), 0); break; /*-----------------------------------------------------------------*/ @@ -784,7 +786,7 @@ for (;;) case OP_SBRA: do { - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); code += GET(code, 1); } while (*code == OP_ALT); @@ -793,11 +795,11 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_CBRA: case OP_SCBRA: - ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0); code += GET(code, 1); while (*code == OP_ALT) { - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); code += GET(code, 1); } break; @@ -808,14 +810,14 @@ for (;;) ADD_ACTIVE(state_offset + 1, 0); code += 1 + GET(code, 2); while (*code == OP_ALT) code += GET(code, 1); - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); break; /*-----------------------------------------------------------------*/ case OP_SKIPZERO: code += 1 + GET(code, 2); while (*code == OP_ALT) code += GET(code, 1); - ADD_ACTIVE(code - start_code + 1 + 
LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); break; /*-----------------------------------------------------------------*/ @@ -920,13 +922,37 @@ for (;;) if (utf8) BACKCHAR(temp); #endif GETCHARTEST(d, temp); +#ifdef SUPPORT_UCP + if ((md->poptions & PCRE_UCP) != 0) + { + if (d == '_') left_word = TRUE; else + { + int cat = UCD_CATEGORY(d); + left_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif left_word = d < 256 && (ctypes[d] & ctype_word) != 0; } - else left_word = 0; + else left_word = FALSE; if (clen > 0) + { +#ifdef SUPPORT_UCP + if ((md->poptions & PCRE_UCP) != 0) + { + if (c == '_') right_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + right_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif right_word = c < 256 && (ctypes[c] & ctype_word) != 0; - else right_word = 0; + } + else right_word = FALSE; if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) { ADD_ACTIVE(state_offset + 1, 0); } @@ -953,7 +979,8 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; break; case PT_GC: @@ -968,6 +995,30 @@ for (;;) OK = prop->script == code[2]; break; + /* These are specials for combination cases. 
*/ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1122,7 +1173,8 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; break; case PT_GC: @@ -1137,6 +1189,30 @@ for (;;) OK = prop->script == code[3]; break; + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. 
*/ default: @@ -1344,7 +1420,8 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; break; case PT_GC: @@ -1359,6 +1436,30 @@ for (;;) OK = prop->script == code[3]; break; + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1591,7 +1692,8 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; break; case PT_GC: @@ -1606,6 +1708,30 @@ for (;;) OK = prop->script == code[5]; break; + /* These are specials for combination cases. 
*/ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -2233,7 +2359,7 @@ for (;;) points to the byte after the end of the class. If there is a quantifier, this is where it will be. */ - next_state_offset = ecode - start_code; + next_state_offset = (int)(ecode - start_code); switch (*ecode) { @@ -2304,7 +2430,7 @@ for (;;) md, /* static match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2315,7 +2441,7 @@ for (;;) if (rc == PCRE_ERROR_DFA_UITEM) return rc; if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) - { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } + { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } } break; @@ -2342,9 +2468,9 @@ for (;;) cb.callout_number = code[LINK_SIZE+2]; cb.offset_vector = offsets; cb.subject = (PCRE_SPTR)start_subject; - cb.subject_length = end_subject - start_subject; - cb.start_match = current_subject - start_subject; - cb.current_position = ptr - start_subject; + cb.subject_length = (int)(end_subject - start_subject); + cb.start_match = (int)(current_subject - start_subject); + 
cb.current_position = (int)(ptr - start_subject); cb.pattern_position = GET(code, LINK_SIZE + 3); cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); cb.capture_top = 1; @@ -2395,7 +2521,7 @@ for (;;) md, /* fixed match data */ asscode, /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2407,7 +2533,7 @@ for (;;) if (rc == PCRE_ERROR_DFA_UITEM) return rc; if ((rc >= 0) == (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) - { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } + { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } } @@ -2428,7 +2554,7 @@ for (;;) md, /* fixed match data */ start_code + GET(code, 1), /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2480,7 +2606,7 @@ for (;;) md, /* fixed match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2497,7 +2623,8 @@ for (;;) do { end_subpattern += GET(end_subpattern, 1); } while (*end_subpattern == OP_ALT); - next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; + next_state_offset = + (int)(end_subpattern - start_code + LINK_SIZE + 1); /* If the end of this subpattern is KETRMAX or KETRMIN, we must arrange for the repeat state also to be added to the relevant list. 
@@ -2505,7 +2632,7 @@ for (;;) repeat_state_offset = (*end_subpattern == OP_KETRMAX || *end_subpattern == OP_KETRMIN)? - end_subpattern - start_code - GET(end_subpattern, 1) : -1; + (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; /* If we have matched an empty string, add the next state at the current character pointer. This is important so that the duplicate @@ -2569,9 +2696,9 @@ for (;;) cb.callout_number = code[1]; cb.offset_vector = offsets; cb.subject = (PCRE_SPTR)start_subject; - cb.subject_length = end_subject - start_subject; - cb.start_match = current_subject - start_subject; - cb.current_position = ptr - start_subject; + cb.subject_length = (int)(end_subject - start_subject); + cb.start_match = (int)(current_subject - start_subject); + cb.current_position = (int)(ptr - start_subject); cb.pattern_position = GET(code, 2); cb.next_item_length = GET(code, 2 + LINK_SIZE); cb.capture_top = 1; @@ -2622,8 +2749,8 @@ for (;;) { if (offsetcount >= 2) { - offsets[0] = md->start_used_ptr - start_subject; - offsets[1] = end_subject - start_subject; + offsets[0] = (int)(md->start_used_ptr - start_subject); + offsets[1] = (int)(end_subject - start_subject); } match_count = PCRE_ERROR_PARTIAL; } @@ -2982,8 +3109,16 @@ for (;;) while (current_subject < end_subject) { register unsigned int c = *current_subject; - if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + current_subject++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(current_subject < end_subject && + (*current_subject & 0xc0) == 0x80) current_subject++; +#endif + } + else break; } } } diff --git a/harbour/external/pcre/pcreexec.c b/harbour/external/pcre/pcreexec.c index e58386ece4..14299d0359 100644 --- a/harbour/external/pcre/pcreexec.c +++ b/harbour/external/pcre/pcreexec.c @@ -71,10 +71,20 @@ defined PCRE_ERROR_xxx codes, which are all negative. */ /* Special internal returns from the match() function. 
Make them sufficiently negative to avoid the external error codes. */ -#define MATCH_COMMIT (-999) -#define MATCH_PRUNE (-998) -#define MATCH_SKIP (-997) -#define MATCH_THEN (-996) +#define MATCH_ACCEPT (-999) +#define MATCH_COMMIT (-998) +#define MATCH_PRUNE (-997) +#define MATCH_SKIP (-996) +#define MATCH_SKIP_ARG (-995) +#define MATCH_THEN (-994) + +/* This is a convenience macro for code that occurs many times. */ + +#define MRRETURN(ra) \ + { \ + md->mark = markptr; \ + RRETURN(ra); \ + } /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, @@ -245,7 +255,8 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, - RM51, RM52, RM53, RM54 }; + RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, + RM61, RM62 }; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't @@ -284,6 +295,7 @@ argument of match(), which never changes. */ #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ {\ heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ + if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ frame->Xwhere = rw; \ newframe->Xeptr = ra;\ newframe->Xecode = rb;\ @@ -304,9 +316,9 @@ argument of match(), which never changes. */ #define RRETURN(ra)\ {\ - heapframe *newframe = frame;\ - frame = newframe->Xprevframe;\ - (pcre_stack_free)(newframe);\ + heapframe *oldframe = frame;\ + frame = oldframe->Xprevframe;\ + (pcre_stack_free)(oldframe);\ if (frame != NULL)\ {\ rrc = ra;\ @@ -413,14 +425,14 @@ the subject. 
*/ if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\ {\ md->hitend = TRUE;\ - if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ + if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\ } #define SCHECK_PARTIAL()\ if (md->partial != 0 && eptr > mstart)\ {\ md->hitend = TRUE;\ - if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ + if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\ } @@ -448,13 +460,14 @@ Arguments: Returns: MATCH_MATCH if matched ) these values are >= 0 MATCH_NOMATCH if failed to match ) + a negative MATCH_xxx value for PRUNE, SKIP, etc a negative PCRE_ERROR_xxx value if aborted by an error condition (e.g. stopped by repeated call or recursion limit) */ static int -match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR - markptr, int offset_top, match_data *md, unsigned long int ims, +match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, + const uschar *markptr, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, @@ -476,6 +489,7 @@ heap whenever RMATCH() does a "recursion". See the macro definitions above. 
*/ #ifdef NO_RECURSE heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); +if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY); frame->Xprevframe = NULL; /* Marks the top level */ /* Copy in the original argument variables */ @@ -671,32 +685,81 @@ for (;;) switch(op) { - case OP_FAIL: - RRETURN(MATCH_NOMATCH); + case OP_MARK: + markptr = ecode + 2; + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM55); - case OP_PRUNE: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM51); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RRETURN(MATCH_PRUNE); + /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an + argument, and we must check whether that argument matches this MARK's + argument. It is passed back in md->start_match_ptr (an overloading of that + variable). If it does match, we reset that variable to the current subject + position and return MATCH_SKIP. Otherwise, pass back the return code + unaltered. 
*/ + + if (rrc == MATCH_SKIP_ARG && + strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0) + { + md->start_match_ptr = eptr; + RRETURN(MATCH_SKIP); + } + + if (md->mark == NULL) md->mark = markptr; + RRETURN(rrc); + + case OP_FAIL: + MRRETURN(MATCH_NOMATCH); case OP_COMMIT: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM52); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RRETURN(MATCH_COMMIT); + MRRETURN(MATCH_COMMIT); + + case OP_PRUNE: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM51); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + MRRETURN(MATCH_PRUNE); + + case OP_PRUNE_ARG: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM56); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->mark = ecode + 2; + RRETURN(MATCH_PRUNE); case OP_SKIP: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM53); if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position */ - RRETURN(MATCH_SKIP); + MRRETURN(MATCH_SKIP); + + case OP_SKIP_ARG: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM57); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + + /* Pass back the current skip name by overloading md->start_match_ptr and + returning the special MATCH_SKIP_ARG return code. This will either be + caught by a matching MARK, or get to the top, where it is treated the same + as PRUNE. */ + + md->start_match_ptr = ecode + 2; + RRETURN(MATCH_SKIP_ARG); case OP_THEN: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM54); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + MRRETURN(MATCH_THEN); + + case OP_THEN_ARG: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM58); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->mark = ecode + 2; RRETURN(MATCH_THEN); /* Handle a capturing bracket. 
If there is space in the offset vector, save @@ -733,7 +796,8 @@ for (;;) save_capture_last = md->capture_last; DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); - md->offset_vector[md->offset_end - number] = eptr - md->start_subject; + md->offset_vector[md->offset_end - number] = + (int)(eptr - md->start_subject); flags = (op == OP_SCBRA)? match_cbegroup : 0; do @@ -752,6 +816,7 @@ for (;;) md->offset_vector[offset+1] = save_offset2; md->offset_vector[md->offset_end - number] = save_offset3; + if (rrc != MATCH_THEN) md->mark = markptr; RRETURN(MATCH_NOMATCH); } @@ -791,6 +856,7 @@ for (;;) RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM48); + if (rrc == MATCH_NOMATCH) md->mark = markptr; RRETURN(rrc); } @@ -826,15 +892,15 @@ for (;;) cb.callout_number = ecode[LINK_SIZE+2]; cb.offset_vector = md->offset_vector; cb.subject = (PCRE_SPTR)md->start_subject; - cb.subject_length = md->end_subject - md->start_subject; - cb.start_match = mstart - md->start_subject; - cb.current_position = eptr - md->start_subject; + cb.subject_length = (int)(md->end_subject - md->start_subject); + cb.start_match = (int)(mstart - md->start_subject); + cb.current_position = (int)(eptr - md->start_subject); cb.pattern_position = GET(ecode, LINK_SIZE + 3); cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; - if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += _pcre_OP_lengths[OP_CALLOUT]; @@ -1054,7 +1120,7 @@ for (;;) { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; - md->offset_vector[offset+1] = eptr - md->start_subject; + md->offset_vector[offset+1] = (int)(eptr - md->start_subject); if (offset_top <= offset) offset_top = offset + 2; } ecode += 3; @@ -1089,14 +1155,19 @@ for (;;) (md->notempty 
|| (md->notempty_atstart && mstart == md->start_subject + md->start_offset))) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* Otherwise, we have a match. */ md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ md->start_match_ptr = mstart; /* and the start (\K can modify) */ - RRETURN(MATCH_MATCH); + + /* For some reason, the macros don't work properly if an expression is + given as the argument to MRRETURN when the heap is in use. */ + + rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; + MRRETURN(rrc); /* Change option settings */ @@ -1118,7 +1189,7 @@ for (;;) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM4); - if (rrc == MATCH_MATCH) + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { mstart = md->start_match_ptr; /* In case \K reset it */ break; @@ -1127,7 +1198,7 @@ for (;;) ecode += GET(ecode, 1); } while (*ecode == OP_ALT); - if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); + if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH); /* If checking an assertion for a condition, return MATCH_MATCH. 
*/ @@ -1151,7 +1222,7 @@ for (;;) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM5); - if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH); if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) { do ecode += GET(ecode,1); while (*ecode == OP_ALT); @@ -1180,7 +1251,7 @@ for (;;) while (i-- > 0) { eptr--; - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH); BACKCHAR(eptr); } } @@ -1191,7 +1262,7 @@ for (;;) { eptr -= GET(ecode, 1); - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH); } /* Save the earliest consulted character, then skip to next op code */ @@ -1212,15 +1283,15 @@ for (;;) cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; cb.subject = (PCRE_SPTR)md->start_subject; - cb.subject_length = md->end_subject - md->start_subject; - cb.start_match = mstart - md->start_subject; - cb.current_position = eptr - md->start_subject; + cb.subject_length = (int)(md->end_subject - md->start_subject); + cb.start_match = (int)(mstart - md->start_subject); + cb.current_position = (int)(eptr - md->start_subject); cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; - if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += 2 + 2*LINK_SIZE; @@ -1286,13 +1357,13 @@ for (;;) { RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, md, ims, eptrb, flags, RM6); - if (rrc == MATCH_MATCH) + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { DPRINTF(("Recursion matched\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); - 
RRETURN(MATCH_MATCH); + MRRETURN(MATCH_MATCH); } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { @@ -1313,7 +1384,7 @@ for (;;) md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never reaches here */ @@ -1332,7 +1403,7 @@ for (;;) do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); - if (rrc == MATCH_MATCH) + if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ { mstart = md->start_match_ptr; break; @@ -1467,7 +1538,7 @@ for (;;) md->end_match_ptr = eptr; /* For ONCE */ md->end_offset_top = offset_top; md->start_match_ptr = mstart; - RRETURN(MATCH_MATCH); + MRRETURN(MATCH_MATCH); } /* For capturing groups we have to check the group number back at the start @@ -1491,7 +1562,7 @@ for (;;) { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; - md->offset_vector[offset+1] = eptr - md->start_subject; + md->offset_vector[offset+1] = (int)(eptr - md->start_subject); if (offset_top <= offset) offset_top = offset + 2; } @@ -1562,12 +1633,12 @@ for (;;) /* Start of subject unless notbol, or after internal newline if multiline */ case OP_CIRC: - if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); + if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH); if ((ims & PCRE_MULTILINE) != 0) { if (eptr != md->start_subject && (eptr == md->end_subject || !WAS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; } @@ -1576,14 +1647,14 @@ for (;;) /* Start of subject assertion */ case OP_SOD: - if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); + if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH); ecode++; break; /* Start of match assertion */ case OP_SOM: - if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); + if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH); ecode++; break; @@ 
-1601,20 +1672,20 @@ for (;;) if ((ims & PCRE_MULTILINE) != 0) { if (eptr < md->end_subject) - { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } + { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); } else - { if (md->noteol) RRETURN(MATCH_NOMATCH); } + { if (md->noteol) MRRETURN(MATCH_NOMATCH); } ecode++; break; } else { - if (md->noteol) RRETURN(MATCH_NOMATCH); + if (md->noteol) MRRETURN(MATCH_NOMATCH); if (!md->endonly) { if (eptr != md->end_subject && (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; } @@ -1624,7 +1695,7 @@ for (;;) /* End of subject assertion (\z) */ case OP_EOD: - if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1633,7 +1704,7 @@ for (;;) case OP_EODN: if (eptr != md->end_subject && (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1651,14 +1722,30 @@ for (;;) #ifdef SUPPORT_UTF8 if (utf8) { + /* Get status of previous character */ + if (eptr == md->start_subject) prev_is_word = FALSE; else { USPTR lastptr = eptr - 1; while((*lastptr & 0xc0) == 0x80) lastptr--; if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; GETCHAR(c, lastptr); +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + if (c == '_') prev_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + prev_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } + + /* Get status of next character */ + if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -1667,47 +1754,89 @@ for (;;) else { GETCHAR(c, eptr); +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + if (c == '_') cur_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + cur_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } } else #endif - /* Not in 
UTF-8 mode */ + /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for + consistency with the behaviour of \w we do use it in this case. */ { + /* Get status of previous character */ + if (eptr == md->start_subject) prev_is_word = FALSE; else { if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + c = eptr[-1]; + if (c == '_') prev_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + prev_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0); } + + /* Get status of next character */ + if (eptr >= md->end_subject) { SCHECK_PARTIAL(); cur_is_word = FALSE; } - else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); + else +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + c = *eptr; + if (c == '_') cur_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + cur_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif + cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); } /* Now see if the situation is what we want */ if ((*ecode++ == OP_WORD_BOUNDARY)? 
cur_is_word == prev_is_word : cur_is_word != prev_is_word) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; /* Match a single character type; inline for speed */ case OP_ANY: - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); /* Fall through */ case OP_ALLANY: if (eptr++ >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; @@ -1720,7 +1849,7 @@ for (;;) if (eptr++ >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; break; @@ -1729,7 +1858,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1738,7 +1867,7 @@ for (;;) #endif (md->ctypes[c] & ctype_digit) != 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1746,7 +1875,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1755,7 +1884,7 @@ for (;;) #endif (md->ctypes[c] & ctype_digit) == 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1763,7 +1892,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1772,7 +1901,7 @@ for (;;) #endif (md->ctypes[c] & ctype_space) != 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1780,7 +1909,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1789,7 +1918,7 @@ for (;;) #endif (md->ctypes[c] & ctype_space) == 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1797,7 +1926,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - 
RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1806,7 +1935,7 @@ for (;;) #endif (md->ctypes[c] & ctype_word) != 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1814,7 +1943,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1823,7 +1952,7 @@ for (;;) #endif (md->ctypes[c] & ctype_word) == 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1831,12 +1960,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -1849,7 +1978,7 @@ for (;;) case 0x0085: case 0x2028: case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } ecode++; @@ -1859,7 +1988,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) @@ -1884,7 +2013,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; break; @@ -1893,12 +2022,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -1927,7 +2056,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) @@ -1940,7 +2069,7 @@ for (;;) case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ 
case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; break; @@ -1949,12 +2078,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -1976,7 +2105,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); { @@ -1985,31 +2114,63 @@ for (;;) switch(ecode[1]) { case PT_ANY: - if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH); break; case PT_LAMP: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; + MRRETURN(MATCH_NOMATCH); + break; case PT_GC: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case PT_PC: if ((ecode[2] != prop->chartype) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case PT_SC: if ((ecode[2] != prop->script) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; + /* These are specials */ + + case PT_ALNUM: + if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + case PT_SPACE: /* Perl space */ + if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + case PT_PXSPACE: /* POSIX space */ + if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) + == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + case 
PT_WORD: + if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + /* This should never occur */ + default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -2025,12 +2186,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); { int category = UCD_CATEGORY(c); - if (category == ucp_M) RRETURN(MATCH_NOMATCH); + if (category == ucp_M) MRRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; @@ -2075,7 +2236,7 @@ for (;;) referenced subpattern. */ if (offset >= offset_top || md->offset_vector[offset] < 0) - length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; + length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1); else length = md->offset_vector[offset+1] - md->offset_vector[offset]; @@ -2109,7 +2270,7 @@ for (;;) if (!match_ref(offset, eptr, length, md, ims)) { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += length; continue; /* With the main loop */ @@ -2129,7 +2290,7 @@ for (;;) if (!match_ref(offset, eptr, length, md, ims)) { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += length; } @@ -2147,11 +2308,11 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (!match_ref(offset, eptr, length, md, ims)) { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += length; } @@ -2178,7 +2339,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr -= length; } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2240,16 +2401,16 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); if 
(c > 255) { - if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); } else { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } } @@ -2262,10 +2423,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } @@ -2287,20 +2448,20 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); if (c > 255) { - if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); } else { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } } @@ -2312,14 +2473,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2385,7 +2546,7 @@ for (;;) } } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2437,10 +2598,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + if (!_pcre_xclass(c, data)) 
MRRETURN(MATCH_NOMATCH); } /* If max == min we can continue with the main loop without the @@ -2457,14 +2618,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2493,7 +2654,7 @@ for (;;) if (eptr-- == pp) break; /* Stop if tried at original pos */ if (utf8) BACKCHAR(eptr); } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -2512,9 +2673,9 @@ for (;;) if (length > md->end_subject - eptr) { CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); + while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH); } else #endif @@ -2524,9 +2685,9 @@ for (;;) if (md->end_subject - eptr < 1) { SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); + if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH); ecode += 2; } break; @@ -2544,7 +2705,7 @@ for (;;) if (length > md->end_subject - eptr) { CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* If the pattern character's value is < 128, we have only one byte, and @@ -2552,7 +2713,7 @@ for (;;) if (fc < 128) { - if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } /* Otherwise we must pick up the subject character */ @@ -2571,7 +2732,7 @@ for (;;) #ifdef SUPPORT_UCP if (dc != UCD_OTHERCASE(fc)) #endif - 
RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } } @@ -2583,9 +2744,9 @@ for (;;) if (md->end_subject - eptr < 1) { SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); ecode += 2; } break; @@ -2679,7 +2840,7 @@ for (;;) else { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } @@ -2691,7 +2852,7 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr <= md->end_subject - length && memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP @@ -2702,7 +2863,7 @@ for (;;) else { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2733,7 +2894,7 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) { RRETURN(MATCH_NOMATCH); } + if (eptr == pp) { MRRETURN(MATCH_NOMATCH); } #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); @@ -2776,9 +2937,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } if (min == max) continue; if (minimize) @@ -2787,13 +2948,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ 
-2819,7 +2980,7 @@ for (;;) eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2833,9 +2994,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != *eptr++) MRRETURN(MATCH_NOMATCH); } if (min == max) continue; @@ -2846,13 +3007,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != *eptr++) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2877,7 +3038,7 @@ for (;;) eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2889,7 +3050,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; GETCHARINCTEST(c, eptr); @@ -2899,11 +3060,11 @@ for (;;) if (c < 256) #endif c = md->lcc[c]; - if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); + if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH); } else { - if (*ecode++ == c) RRETURN(MATCH_NOMATCH); + if (*ecode++ == c) MRRETURN(MATCH_NOMATCH); } break; @@ -2997,11 +3158,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3014,9 +3175,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc == md->lcc[*eptr++]) 
MRRETURN(MATCH_NOMATCH); } } @@ -3033,15 +3194,15 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3052,13 +3213,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -3120,7 +3281,7 @@ for (;;) } } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -3139,10 +3300,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3154,9 +3315,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == *eptr++) RRETURN(MATCH_NOMATCH); + if (fc == *eptr++) MRRETURN(MATCH_NOMATCH); } } @@ -3173,14 +3334,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3191,13 +3352,13 
@@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == *eptr++) RRETURN(MATCH_NOMATCH); + if (fc == *eptr++) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -3258,7 +3419,7 @@ for (;;) } } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -3352,13 +3513,13 @@ for (;;) switch(prop_type) { case PT_ANY: - if (prop_fail_result) RRETURN(MATCH_NOMATCH); + if (prop_fail_result) MRRETURN(MATCH_NOMATCH); for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); } @@ -3370,14 +3531,14 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3387,12 +3548,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3402,12 +3563,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3417,15 +3578,84 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } 
GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; + case PT_ALNUM: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_SPACE: /* Perl space */ + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_PXSPACE: /* POSIX space */ + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_WORD: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N || + c == CHAR_UNDERSCORE) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + /* This should not occur */ + default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -3441,11 +3671,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); - if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); + if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH); while (eptr < 
md->end_subject) { int len = 1; @@ -3472,9 +3702,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3486,7 +3716,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; @@ -3494,7 +3724,7 @@ for (;;) break; case OP_ANYBYTE: - if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); + if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH); eptr += min; break; @@ -3504,12 +3734,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -3522,7 +3752,7 @@ for (;;) case 0x0085: case 0x2028: case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } } @@ -3534,7 +3764,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) @@ -3559,7 +3789,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3570,12 +3800,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -3606,7 +3836,7 @@ for (;;) if (eptr >= md->end_subject) { 
SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) @@ -3619,7 +3849,7 @@ for (;;) case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3630,12 +3860,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -3654,11 +3884,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3668,10 +3898,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -3682,10 +3912,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; @@ -3696,10 +3926,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -3710,10 +3940,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr < 128 && 
(md->ctypes[*eptr] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; @@ -3724,10 +3954,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -3750,9 +3980,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); eptr++; } break; @@ -3761,7 +3991,7 @@ for (;;) if (eptr > md->end_subject - min) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += min; break; @@ -3770,7 +4000,7 @@ for (;;) if (eptr > md->end_subject - min) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += min; break; @@ -3781,11 +4011,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -3795,7 +4025,7 @@ for (;;) case 0x000b: case 0x000c: case 0x0085: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } } @@ -3807,7 +4037,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { @@ -3815,7 +4045,7 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3826,11 +4056,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { 
- default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -3845,7 +4075,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { @@ -3855,7 +4085,7 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3866,11 +4096,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -3887,9 +4117,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3899,9 +4129,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3911,9 +4141,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3923,9 +4153,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3935,10 +4165,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + 
MRRETURN(MATCH_NOMATCH); } if ((md->ctypes[*eptr++] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3948,10 +4178,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if ((md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3980,14 +4210,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); - if (prop_fail_result) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if (prop_fail_result) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -3996,18 +4226,18 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -4016,16 +4246,16 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never 
gets here */ @@ -4034,16 +4264,16 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -4052,19 +4282,101 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ + case PT_ALNUM: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_SPACE: /* Perl space */ + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c 
== CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_PXSPACE: /* POSIX space */ + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_WORD: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || + prop_category == ucp_N || + c == CHAR_UNDERSCORE) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* This should never occur */ + default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -4079,15 +4391,15 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); - if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); + if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; @@ -4111,14 +4423,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) 
MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (ctype == OP_ANY && IS_NEWLINE(eptr)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { @@ -4130,7 +4442,7 @@ for (;;) case OP_ANYNL: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -4142,7 +4454,7 @@ for (;;) case 0x0085: case 0x2028: case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } break; @@ -4170,14 +4482,14 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -4212,14 +4524,14 @@ for (;;) case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -4233,32 +4545,32 @@ for (;;) case OP_NOT_DIGIT: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_DIGIT: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: if (c < 256 && (md->ctypes[c] & ctype_word) != 
0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; default: @@ -4274,14 +4586,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (ctype == OP_ANY && IS_NEWLINE(eptr)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); c = *eptr++; switch(ctype) { @@ -4293,7 +4605,7 @@ for (;;) case OP_ANYNL: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -4304,7 +4616,7 @@ for (;;) case 0x000b: case 0x000c: case 0x0085: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } break; @@ -4316,14 +4628,14 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -4340,14 +4652,14 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -4358,27 +4670,27 @@ for (;;) break; case OP_NOT_DIGIT: - if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH); break; case OP_DIGIT: - if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_digit) == 0) 
MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: - if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: - if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: - if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: - if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH); break; default: @@ -4411,7 +4723,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); if (prop_fail_result) break; eptr+= len; } @@ -4426,7 +4738,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || @@ -4445,7 +4757,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) break; @@ -4462,7 +4774,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) break; @@ -4479,13 +4791,90 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; } break; + + case PT_ALNUM: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N) + == prop_fail_result) + break; 
+ eptr+= len; + } + break; + + case PT_SPACE: /* Perl space */ + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_PXSPACE: /* POSIX space */ + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_WORD: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N || + c == CHAR_UNDERSCORE) == prop_fail_result) + break; + eptr+= len; + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ @@ -5038,7 +5427,7 @@ for (;;) /* Get here if we can't make it match with any permitted repetitions */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -5071,12 +5460,13 @@ switch (frame->Xwhere) LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) - LBL(53) LBL(54) + LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) #ifdef SUPPORT_UTF8 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) + LBL(59) LBL(60) LBL(61) LBL(62) #endif /* SUPPORT_UCP */ #endif /* 
SUPPORT_UTF8 */ default: @@ -5280,6 +5670,7 @@ end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; +md->use_ucp = (re->options & PCRE_UCP) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; @@ -5289,6 +5680,7 @@ md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; md->hitend = FALSE; +md->mark = NULL; /* In case never set */ md->recursive = NULL; /* No recursion at top level */ @@ -5567,8 +5959,16 @@ for(;;) while (start_match < end_subject) { register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + start_match++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; +#endif + } + else break; } } } /* Starting optimizations */ @@ -5669,6 +6069,23 @@ for(;;) switch(rc) { + /* SKIP passes back the next starting point explicitly, but if it is the + same as the match we have just done, treat it as NOMATCH. */ + + case MATCH_SKIP: + if (md->start_match_ptr != start_match) + { + new_start_match = md->start_match_ptr; + break; + } + /* Fall through */ + + /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched + the SKIP's arg was not found. We also treat this as NOMATCH. */ + + case MATCH_SKIP_ARG: + /* Fall through */ + /* NOMATCH and PRUNE advance by one character. THEN at this level acts exactly like PRUNE. */ @@ -5683,12 +6100,6 @@ for(;;) #endif break; - /* SKIP passes back the next starting point explicitly. */ - - case MATCH_SKIP: - new_start_match = md->start_match_ptr; - break; - /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. 
*/ case MATCH_COMMIT: @@ -5734,7 +6145,8 @@ for(;;) md->nllen == 2)) start_match++; - } /* End of for(;;) "bumpalong" loop */ + md->mark = NULL; /* Reset for start of next match attempt */ + } /* End of for(;;) "bumpalong" loop */ /* ==========================================================================*/ @@ -5758,7 +6170,7 @@ capturing parentheses than vector slots. */ ENDLOOP: -if (rc == MATCH_MATCH) +if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) { if (using_temporary_offsets) { @@ -5784,12 +6196,12 @@ if (rc == MATCH_MATCH) if (offsetcount < 2) rc = 0; else { - offsets[0] = md->start_match_ptr - md->start_subject; - offsets[1] = md->end_match_ptr - md->start_subject; + offsets[0] = (int)(md->start_match_ptr - md->start_subject); + offsets[1] = (int)(md->end_match_ptr - md->start_subject); } DPRINTF((">>>> returning %d\n", rc)); - return rc; + goto RETURN_MARK; } /* Control gets here if there has been an error, or if the overall match @@ -5801,26 +6213,43 @@ if (using_temporary_offsets) (pcre_free)(md->offset_vector); } +/* For anything other than nomatch or partial match, just return the code. */ + if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) { DPRINTF((">>>> error: returning %d\n", rc)); return rc; } -else if (start_partial != NULL) + +/* Handle partial matches - disable any mark data */ + +if (start_partial != NULL) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); + md->mark = NULL; if (offsetcount > 1) { - offsets[0] = start_partial - (USPTR)subject; - offsets[1] = end_subject - (USPTR)subject; + offsets[0] = (int)(start_partial - (USPTR)subject); + offsets[1] = (int)(end_subject - (USPTR)subject); } - return PCRE_ERROR_PARTIAL; + rc = PCRE_ERROR_PARTIAL; } + +/* This is the classic nomatch case */ + else { DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); - return PCRE_ERROR_NOMATCH; + rc = PCRE_ERROR_NOMATCH; } + +/* Return the MARK data if it has been requested. 
*/ + +RETURN_MARK: + +if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) + *(extra_data->mark) = (unsigned char *)(md->mark); +return rc; } /* End of pcre_exec.c */ diff --git a/harbour/external/pcre/pcreinal.h b/harbour/external/pcre/pcreinal.h index 4554657497..e293602fe6 100644 --- a/harbour/external/pcre/pcreinal.h +++ b/harbour/external/pcre/pcreinal.h @@ -475,7 +475,8 @@ know we are in UTF-8 mode. */ } \ } -/* Get the next character, testing for UTF-8 mode, and advancing the pointer */ +/* Get the next character, testing for UTF-8 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-8 mode. */ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ @@ -512,7 +513,7 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */ /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the pointer, incrementing length if there are extra bytes. This is called when we -know we are in UTF-8 mode. */ +do not know if we are in UTF-8 mode. */ #define GETCHARLENTEST(c, eptr, len) \ c = *eptr; \ @@ -580,7 +581,7 @@ time, run time, or study time, respectively. */ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ - PCRE_JAVASCRIPT_COMPAT) + PCRE_JAVASCRIPT_COMPAT|PCRE_UCP) #define PUBLIC_EXEC_OPTIONS \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ @@ -875,6 +876,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ #define STRING_COMMIT0 "COMMIT\0" #define STRING_F0 "F\0" #define STRING_FAIL0 "FAIL\0" +#define STRING_MARK0 "MARK\0" #define STRING_PRUNE0 "PRUNE\0" #define STRING_SKIP0 "SKIP\0" #define STRING_THEN "THEN" @@ -904,6 +906,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. 
*/ #define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" #define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" #define STRING_UTF8_RIGHTPAR "UTF8)" +#define STRING_UCP_RIGHTPAR "UCP)" #else /* SUPPORT_UTF8 */ @@ -1127,6 +1130,7 @@ only. */ #define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0" #define STRING_F0 STR_F "\0" #define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0" +#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0" #define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0" #define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0" #define STRING_THEN STR_T STR_H STR_E STR_N @@ -1156,6 +1160,7 @@ only. */ #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS #define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS #endif /* SUPPORT_UTF8 */ @@ -1188,9 +1193,13 @@ only. */ #define PT_ANY 0 /* Any property - matches all chars */ #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ -#define PT_GC 2 /* General characteristic (e.g. L) */ -#define PT_PC 3 /* Particular characteristic (e.g. Lu) */ +#define PT_GC 2 /* Specified general characteristic (e.g. L) */ +#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */ #define PT_SC 4 /* Script (e.g. Han) */ +#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */ +#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */ +#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ +#define PT_WORD 8 /* Word - L plus N plus underscore */ /* Flag bits and data types for the extended class (OP_XCLASS) for classes that contain UTF-8 characters with values greater than 255. */ @@ -1207,9 +1216,15 @@ contain UTF-8 characters with values greater than 255. 
*/ /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode -definitions below, up to ESC_z. There's a dummy for OP_ANY because it -corresponds to "." rather than an escape sequence, and another for OP_ALLANY -(which is used for [^] in JavaScript compatibility mode). +definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it +corresponds to "." in DOTALL mode rather than an escape sequence. It is also +used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves +like \N. + +The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. +when PCRE_UCP is set, when replacement of \d etc by \p sequences is required. +They must be contiguous, and remain in order so that the replacements can be +looked up from a table. The final escape must be ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). There are two tests in the code for an escape @@ -1219,11 +1234,12 @@ put in between that don't consume a character, that code will have to change. */ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, - ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, - ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k, + ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, + ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, + ESC_E, ESC_Q, ESC_g, ESC_k, + ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, ESC_REF }; - /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in order to the list of escapes immediately above. 
@@ -1247,8 +1263,8 @@ enum { OP_WHITESPACE, /* 9 \s */ OP_NOT_WORDCHAR, /* 10 \W */ OP_WORDCHAR, /* 11 \w */ - OP_ANY, /* 12 Match any character (subject to DOTALL) */ - OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */ + OP_ANY, /* 12 Match any character except newline */ + OP_ALLANY, /* 13 Match any character */ OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ OP_NOTPROP, /* 15 \P (not Unicode property) */ OP_PROP, /* 16 \p (Unicode property) */ @@ -1378,20 +1394,24 @@ enum { /* These are backtracking control verbs */ - OP_PRUNE, /* 107 */ - OP_SKIP, /* 108 */ - OP_THEN, /* 109 */ - OP_COMMIT, /* 110 */ + OP_MARK, /* 107 always has an argument */ + OP_PRUNE, /* 108 */ + OP_PRUNE_ARG, /* 109 same, but with argument */ + OP_SKIP, /* 110 */ + OP_SKIP_ARG, /* 111 same, but with argument */ + OP_THEN, /* 112 */ + OP_THEN_ARG, /* 113 same, but with argument */ + OP_COMMIT, /* 114 */ /* These are forced failure and success verbs */ - OP_FAIL, /* 111 */ - OP_ACCEPT, /* 112 */ - OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */ + OP_FAIL, /* 115 */ + OP_ACCEPT, /* 116 */ + OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO, /* 114 */ + OP_SKIPZERO, /* 118 */ /* This is not an opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors - there have been @@ -1402,7 +1422,7 @@ enum { /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro definitions that follow must also be updated to match. There are also tables -called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */ +called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */ /* This macro defines textual names for all the opcodes. These are used only @@ -1427,7 +1447,8 @@ for debugging. The macro is referenced only in pcre_printint.c. 
*/ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \ "Brazero", "Braminzero", \ - "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ + "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ + "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ "Close", "Skip zero" @@ -1493,8 +1514,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 3, 3, /* RREF, NRREF */ \ 1, /* DEF */ \ 1, 1, /* BRAZERO, BRAMINZERO */ \ - 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ - 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ + 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG, */ \ + 1, 3, 1, 3, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */ \ + 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */ /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion" @@ -1512,7 +1534,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, - ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT }; + ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERRCOUNT }; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. 
We store an explicit @@ -1655,6 +1677,7 @@ typedef struct match_data { BOOL noteol; /* NOTEOL flag */ BOOL utf8; /* UTF8 flag */ BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ + BOOL use_ucp; /* PCRE_UCP flag */ BOOL endonly; /* Dollar not before final \n */ BOOL notempty; /* Empty string match not wanted */ BOOL notempty_atstart; /* Empty string match at start not wanted */ @@ -1674,6 +1697,7 @@ typedef struct match_data { int eptrn; /* Next free eptrblock */ recursion_info *recursive; /* Linked list of recursion data */ void *callout_data; /* To pass back to callouts */ + const uschar *mark; /* Mark pointer to pass back */ } match_data; /* A similar structure is used for the same purpose by the DFA matching diff --git a/harbour/external/pcre/pcreprni.h b/harbour/external/pcre/pcreprni.h index 86b02b5ca4..49d93174cc 100644 --- a/harbour/external/pcre/pcreprni.h +++ b/harbour/external/pcre/pcreprni.h @@ -534,6 +534,14 @@ for(;;) } break; + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + fprintf(f, " %s %s", OP_names[*code], code + 2); + extra += code[1]; + break; + /* Anything else is just an item with no data*/ default: diff --git a/harbour/external/pcre/pcrestud.c b/harbour/external/pcre/pcrestud.c index 1730d945b8..54e65ad98c 100644 --- a/harbour/external/pcre/pcrestud.c +++ b/harbour/external/pcre/pcrestud.c @@ -48,6 +48,7 @@ supporting functions. */ #include "pcreinal.h" +#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7)) /* Returns from set_start_bits() */ @@ -413,6 +414,15 @@ for (;;) #endif break; + /* Skip these, but we need to add in the name length. */ + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + cc += _pcre_OP_lengths[op] + cc[1]; + break; + /* For the record, these are the opcodes that are matched by "default": OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP, OP_THEN. 
*/ @@ -431,25 +441,121 @@ for (;;) * Set a bit and maybe its alternate case * *************************************************/ -/* Given a character, set its bit in the table, and also the bit for the other -version of a letter if we are caseless. +/* Given a character, set its first byte's bit in the table, and also the +corresponding bit for the other version of a letter if we are caseless. In +UTF-8 mode, for characters greater than 127, we can only do the caseless thing +when Unicode property support is available. Arguments: start_bits points to the bit map - c is the character + p points to the character caseless the caseless flag cd the block with char table pointers + utf8 TRUE for UTF-8 mode -Returns: nothing +Returns: pointer after the character +*/ + +static const uschar * +set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless, + compile_data *cd, BOOL utf8) +{ +unsigned int c = *p; + +SET_BIT(c); + +#ifdef SUPPORT_UTF8 +if (utf8 && c > 127) + { + GETCHARINC(c, p); +#ifdef SUPPORT_UCP + if (caseless) + { + uschar buff[8]; + c = UCD_OTHERCASE(c); + (void)_pcre_ord2utf8(c, buff); + SET_BIT(buff[0]); + } +#endif + return p; + } +#endif + +/* Not UTF-8 mode, or character is less than 128. */ + +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); +return p + 1; +} + + + +/************************************************* +* Set bits for a positive character type * +*************************************************/ + +/* This function sets starting bits for a character type. In UTF-8 mode, we can +only do a direct setting for bytes less than 128, as otherwise there can be +confusion with bytes in the middle of UTF-8 characters. In a "traditional" +environment, the tables will only recognize ASCII characters anyway, but in at +least one Windows environment, some higher bytes bits were set in the tables. +So we deal with that case by considering the UTF-8 encoding.
+ +Arguments: + start_bits the starting bitmap + cbit_type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + cd the block with char table pointers + +Returns: nothing */ static void -set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless, +set_type_bits(uschar *start_bits, int cbit_type, int table_limit, compile_data *cd) { -start_bits[c/8] |= (1 << (c&7)); -if (caseless && (cd->ctypes[c] & ctype_letter) != 0) - start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; +if (table_limit == 32) return; +for (c = 128; c < 256; c++) + { + if ((cd->cbits[c/8] & (1 << (c&7))) != 0) + { + uschar buff[8]; + (void)_pcre_ord2utf8(c, buff); + SET_BIT(buff[0]); + } + } +} + + +/************************************************* +* Set bits for a negative character type * +*************************************************/ + +/* This function sets starting bits for a negative character type such as \D. +In UTF-8 mode, we can only do a direct setting for bytes less than 128, as +otherwise there can be confusion with bytes in the middle of UTF-8 characters. +Unlike in the positive case, where we can set appropriate starting bits for +specific high-valued UTF-8 characters, in this case we have to set the bits for +all high-valued characters. The lowest is 0xc2, but we overkill by starting at +0xc0 (192) for simplicity.
+ +Arguments: + start_bits the starting bitmap + cbit_type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit, + compile_data *cd) +{ +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; } @@ -484,6 +590,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, { register int c; int yield = SSB_DONE; +int table_limit = utf8? 16:32; #if 0 /* ========================================================================= */ @@ -607,12 +714,7 @@ do case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: - set_table_bit(start_bits, tcode[1], caseless, cd); - tcode += 2; -#ifdef SUPPORT_UTF8 - if (utf8 && tcode[-1] >= 0xc0) - tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; -#endif + tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8); break; /* Single-char upto sets the bit and tries the next */ @@ -620,12 +722,7 @@ do case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - set_table_bit(start_bits, tcode[3], caseless, cd); - tcode += 4; -#ifdef SUPPORT_UTF8 - if (utf8 && tcode[-1] >= 0xc0) - tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; -#endif + tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8); break; /* At least one single char sets the bit and stops */ @@ -638,59 +735,86 @@ do case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - set_table_bit(start_bits, tcode[1], caseless, cd); + (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8); try_next = FALSE; break; - /* Single character type sets the bits and stops */ + /* Special spacing and line-terminating items. These recognize specific + lists of characters.
The difference between VSPACE and ANYNL is that the + latter can match the two-character CRLF sequence, but that is not + relevant for finding the first character, so their code here is + identical. */ + + case OP_HSPACE: + SET_BIT(0x09); + SET_BIT(0x20); + if (utf8) + { + SET_BIT(0xC2); /* For U+00A0 */ + SET_BIT(0xE1); /* For U+1680, U+180E */ + SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ + SET_BIT(0xE3); /* For U+3000 */ + } + else SET_BIT(0xA0); + try_next = FALSE; + break; + + case OP_ANYNL: + case OP_VSPACE: + SET_BIT(0x0A); + SET_BIT(0x0B); + SET_BIT(0x0C); + SET_BIT(0x0D); + if (utf8) + { + SET_BIT(0xC2); /* For U+0085 */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else SET_BIT(0x85); + try_next = FALSE; + break; + + /* Single character types set the bits and stop. Note that if PCRE_UCP + is set, we do not see these op codes because \d etc are converted to + properties. Therefore, these apply in the case when only characters less + than 256 are recognized to match the types. */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_digit]; + set_nottype_bits(start_bits, cbit_digit, table_limit, cd); try_next = FALSE; break; case OP_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_digit]; + set_type_bits(start_bits, cbit_digit, table_limit, cd); try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + ensure it is set as not whitespace. */ case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= ~d; - } + set_nottype_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] |= 0x08; try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + not set it from the table. 
*/ case OP_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= d; - } + c = start_bits[1]; /* Save in case it was already set */ + set_type_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] = (start_bits[1] & ~0x08) | c; try_next = FALSE; break; case OP_NOT_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_word]; + set_nottype_bits(start_bits, cbit_word, table_limit, cd); try_next = FALSE; break; case OP_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_word]; + set_type_bits(start_bits, cbit_word, table_limit, cd); try_next = FALSE; break; @@ -699,6 +823,7 @@ do case OP_TYPEPLUS: case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: tcode++; break; @@ -722,52 +847,69 @@ do case OP_TYPEPOSQUERY: switch(tcode[1]) { + default: case OP_ANY: case OP_ALLANY: return SSB_FAIL; + case OP_HSPACE: + SET_BIT(0x09); + SET_BIT(0x20); + if (utf8) + { + SET_BIT(0xC2); /* For U+00A0 */ + SET_BIT(0xE1); /* For U+1680, U+180E */ + SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ + SET_BIT(0xE3); /* For U+3000 */ + } + else SET_BIT(0xA0); + break; + + case OP_ANYNL: + case OP_VSPACE: + SET_BIT(0x0A); + SET_BIT(0x0B); + SET_BIT(0x0C); + SET_BIT(0x0D); + if (utf8) + { + SET_BIT(0xC2); /* For U+0085 */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else SET_BIT(0x85); + break; + case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_digit]; + set_nottype_bits(start_bits, cbit_digit, table_limit, cd); break; case OP_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_digit]; + set_type_bits(start_bits, cbit_digit, table_limit, cd); break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + ensure it gets set as not whitespace. 
*/ case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= ~d; - } + set_nottype_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] |= 0x08; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + avoid setting it. */ case OP_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= d; - } + c = start_bits[1]; /* Save in case it was already set */ + set_type_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] = (start_bits[1] & ~0x08) | c; break; case OP_NOT_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_word]; + set_nottype_bits(start_bits, cbit_word, table_limit, cd); break; case OP_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_word]; + set_type_bits(start_bits, cbit_word, table_limit, cd); break; } diff --git a/harbour/external/pcre/pcretabs.c b/harbour/external/pcre/pcretabs.c index 55c713345a..6fca301417 100644 --- a/harbour/external/pcre/pcretabs.c +++ b/harbour/external/pcre/pcretabs.c @@ -243,6 +243,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. 
*/ #define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0" #define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0" #define STRING_Vai0 STR_V STR_a STR_i "\0" +#define STRING_Xan0 STR_X STR_a STR_n "\0" +#define STRING_Xps0 STR_X STR_p STR_s "\0" +#define STRING_Xsp0 STR_X STR_s STR_p "\0" +#define STRING_Xwd0 STR_X STR_w STR_d "\0" #define STRING_Yi0 STR_Y STR_i "\0" #define STRING_Z0 STR_Z "\0" #define STRING_Zl0 STR_Z STR_l "\0" @@ -376,6 +380,10 @@ const char _pcre_utt_names[] = STRING_Tifinagh0 STRING_Ugaritic0 STRING_Vai0 + STRING_Xan0 + STRING_Xps0 + STRING_Xsp0 + STRING_Xwd0 STRING_Yi0 STRING_Z0 STRING_Zl0 @@ -509,11 +517,15 @@ const ucp_type_table _pcre_utt[] = { { 891, PT_SC, ucp_Tifinagh }, { 900, PT_SC, ucp_Ugaritic }, { 909, PT_SC, ucp_Vai }, - { 913, PT_SC, ucp_Yi }, - { 916, PT_GC, ucp_Z }, - { 918, PT_PC, ucp_Zl }, - { 921, PT_PC, ucp_Zp }, - { 924, PT_PC, ucp_Zs } + { 913, PT_ALNUM, 0 }, + { 917, PT_PXSPACE, 0 }, + { 921, PT_SPACE, 0 }, + { 925, PT_WORD, 0 }, + { 929, PT_SC, ucp_Yi }, + { 932, PT_GC, ucp_Z }, + { 934, PT_PC, ucp_Zl }, + { 937, PT_PC, ucp_Zp }, + { 940, PT_PC, ucp_Zs } }; const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); diff --git a/harbour/external/pcre/pcrexcls.c b/harbour/external/pcre/pcrexcls.c index f1fdba9779..96fd925e04 100644 --- a/harbour/external/pcre/pcrexcls.c +++ b/harbour/external/pcre/pcrexcls.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2010 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -112,12 +112,13 @@ while ((t = *data++) != XCL_END) break; case PT_LAMP: - if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == - (t == XCL_PROP)) return !negated; + if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: - if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) return !negated; + if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) + return !negated; break; case PT_PC: @@ -128,6 +129,33 @@ while ((t = *data++) != XCL_END) if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; break; + case PT_ALNUM: + if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N) == (t == XCL_PROP)) + return !negated; + break; + + case PT_SPACE: /* Perl space */ + if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (t == XCL_PROP)) + return !negated; + break; + + case PT_PXSPACE: /* POSIX space */ + if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) + return !negated; + break; + + case PT_WORD: + if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || + _pcre_ucp_gentype[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) + == (t == XCL_PROP)) + return !negated; + break; + /* This should never occur, but compilers may mutter if there is no default. */