2012-02-07 22:25 UTC+0100 Viktor Szakats (harbour syenar.net)
* src/3rd/pcre/Makefile
* src/3rd/pcre/*
+ src/3rd/pcre/pcrejitc.c
* 8.12 -> 8.21 (using hb3rdpat)
; NOTE: New JIT feature off by default, not enabled
This commit is contained in:
@@ -16,6 +16,13 @@
|
||||
The license applies to all entries newer than 2009-04-28.
|
||||
*/
|
||||
|
||||
2012-02-07 22:25 UTC+0100 Viktor Szakats (harbour syenar.net)
|
||||
* src/3rd/pcre/Makefile
|
||||
* src/3rd/pcre/*
|
||||
+ src/3rd/pcre/pcrejitc.c
|
||||
* 8.12 -> 8.21 (using hb3rdpat)
|
||||
; NOTE: New JIT feature off by default, not enabled
|
||||
|
||||
2012-02-07 22:10 UTC+0100 Viktor Szakats (harbour syenar.net)
|
||||
* src/3rd/jpeg/Makefile
|
||||
* src/3rd/jpeg/*
|
||||
|
||||
@@ -9,7 +9,9 @@ specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
The basic library functions are written in C and are freestanding. Also
|
||||
included in the distribution is a set of C++ wrapper functions.
|
||||
included in the distribution is a set of C++ wrapper functions, and a
|
||||
just-in-time compiler that can be used to optimize pattern matching. These
|
||||
are both optional features that can be omitted when the library is built.
|
||||
|
||||
|
||||
THE BASIC LIBRARY FUNCTIONS
|
||||
@@ -22,7 +24,29 @@ Email domain: cam.ac.uk
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2011 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
PCRE JUST-IN-TIME COMPILATION SUPPORT
|
||||
-------------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2011 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
STACK-LESS JUST-IN-TIME COMPILER
|
||||
--------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2011 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
@@ -31,7 +55,7 @@ THE C++ WRAPPER FUNCTIONS
|
||||
|
||||
Contributed by: Google Inc.
|
||||
|
||||
Copyright (c) 2007-2010, Google Inc.
|
||||
Copyright (c) 2007-2011, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ C_SOURCES := \
|
||||
pcreget.c \
|
||||
pcreglob.c \
|
||||
pcreinfo.c \
|
||||
pcrejitc.c \
|
||||
pcremktb.c \
|
||||
pcrenewl.c \
|
||||
pcreoutf.c \
|
||||
@@ -72,8 +73,8 @@ else
|
||||
endif
|
||||
|
||||
# ORIGIN http://www.pcre.org/
|
||||
# VER 8.12
|
||||
# URL ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.12.tar.gz
|
||||
# VER 8.21
|
||||
# URL ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.21.tar.gz
|
||||
# DIFF pcre.dif
|
||||
#
|
||||
# MAP LICENCE
|
||||
@@ -90,6 +91,7 @@ endif
|
||||
# MAP pcre_get.c pcreget.c
|
||||
# MAP pcre_globals.c pcreglob.c
|
||||
# MAP pcre_info.c pcreinfo.c
|
||||
# MAP pcre_jit_compile.c pcrejitc.c
|
||||
# MAP pcre_maketables.c pcremktb.c
|
||||
# MAP pcre_newline.c pcrenewl.c
|
||||
# MAP pcre_ord2utf8.c pcreoutf.c
|
||||
|
||||
@@ -250,7 +250,7 @@ them both to 0; an emulation function will be used. */
|
||||
#define PACKAGE_NAME "PCRE"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE 8.12"
|
||||
#define PACKAGE_STRING "PCRE 8.21"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre"
|
||||
@@ -259,7 +259,17 @@ them both to 0; an emulation function will be used. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "8.12"
|
||||
#define PACKAGE_VERSION "8.21"
|
||||
|
||||
/* The value of PCREGREP_BUFSIZE determines the size of buffer used by
|
||||
pcregrep to hold parts of the file it is searching. On systems that support
|
||||
it, "configure" can be used to override the default, which is 8192. This is
|
||||
also the minimum value. The actual amount of memory used by pcregrep is
|
||||
three times this number, because it allows for the buffering of "before"
|
||||
and "after" lines. */
|
||||
#ifndef PCREGREP_BUFSIZE
|
||||
#define PCREGREP_BUFSIZE 20480
|
||||
#endif
|
||||
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
@@ -293,6 +303,9 @@ them both to 0; an emulation function will be used. */
|
||||
#define STDC_HEADERS 1
|
||||
#endif
|
||||
|
||||
/* Define to enable support for Just-In-Time compiling. */
|
||||
/* #undef SUPPORT_JIT */
|
||||
|
||||
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
|
||||
handle .bz2 files. */
|
||||
/* #undef SUPPORT_LIBBZ2 */
|
||||
@@ -304,7 +317,10 @@ them both to 0; an emulation function will be used. */
|
||||
handle .gz files. */
|
||||
/* #undef SUPPORT_LIBZ */
|
||||
|
||||
/* Define to enable support for Unicode properties */
|
||||
/* Define to enable JIT support in pcregrep. */
|
||||
/* #undef SUPPORT_PCREGREP_JIT */
|
||||
|
||||
/* Define to enable support for Unicode properties. */
|
||||
/* #undef SUPPORT_UCP */
|
||||
|
||||
/* Define to enable support for the UTF-8 Unicode encoding. This will work
|
||||
@@ -315,7 +331,7 @@ them both to 0; an emulation function will be used. */
|
||||
|
||||
/* Version number of package */
|
||||
#ifndef VERSION
|
||||
#define VERSION "8.12"
|
||||
#define VERSION "8.21"
|
||||
#endif
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
diff -urN pcre.orig/pcrefinf.c pcre/pcrefinf.c
|
||||
--- pcre.orig/pcrefinf.c 2011-01-15 18:09:50.426241817 +0100
|
||||
+++ pcre/pcrefinf.c 2011-01-15 18:09:50.696242981 +0100
|
||||
@@ -126,7 +126,7 @@
|
||||
diff -urN pcre.orig\pcrefinf.c pcre\pcrefinf.c
|
||||
--- pcre.orig\pcrefinf.c Tue Feb 07 22:22:32 2012
|
||||
+++ pcre\pcrefinf.c Tue Feb 07 22:22:32 2012
|
||||
@@ -139,7 +139,7 @@
|
||||
case PCRE_INFO_MINLENGTH:
|
||||
*((int *)where) =
|
||||
(study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)?
|
||||
@@ -9,10 +9,10 @@ diff -urN pcre.orig/pcrefinf.c pcre/pcrefinf.c
|
||||
+ (int)study->minlength : -1;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_LASTLITERAL:
|
||||
diff -urN pcre.orig/pcreglob.c pcre/pcreglob.c
|
||||
--- pcre.orig/pcreglob.c 2011-01-15 18:09:50.446241418 +0100
|
||||
+++ pcre/pcreglob.c 2011-01-15 18:09:50.696242981 +0100
|
||||
case PCRE_INFO_JIT:
|
||||
diff -urN pcre.orig\pcreglob.c pcre\pcreglob.c
|
||||
--- pcre.orig\pcreglob.c Tue Feb 07 22:22:32 2012
|
||||
+++ pcre\pcreglob.c Tue Feb 07 22:22:32 2012
|
||||
@@ -74,11 +74,17 @@
|
||||
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, to be #included by
|
||||
applications that call the PCRE functions.
|
||||
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2011 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE_MAJOR 8
|
||||
#define PCRE_MINOR 12
|
||||
#define PCRE_MINOR 21
|
||||
#define PCRE_PRERELEASE
|
||||
#define PCRE_DATE 2011-01-15
|
||||
#define PCRE_DATE 2011-12-12
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE, the appropriate
|
||||
@@ -98,20 +98,25 @@ extern "C" {
|
||||
/* Options. Some are compile-time only, some are run-time only, and some are
|
||||
both, so we keep them all distinct. However, almost all the bits in the options
|
||||
word are now used. In the long run, we may have to re-use some of the
|
||||
compile-time only bits for runtime options, or vice versa. */
|
||||
compile-time only bits for runtime options, or vice versa. In the comments
|
||||
below, "compile", "exec", and "DFA exec" mean that the option is permitted to
|
||||
be set for those functions; "used in" means that an option may be set only for
|
||||
compile, but is subsequently referenced in exec and/or DFA exec. Any of the
|
||||
compile-time options may be inspected during studying (and therefore JIT
|
||||
compiling). */
|
||||
|
||||
#define PCRE_CASELESS 0x00000001 /* Compile */
|
||||
#define PCRE_MULTILINE 0x00000002 /* Compile */
|
||||
#define PCRE_DOTALL 0x00000004 /* Compile */
|
||||
#define PCRE_EXTENDED 0x00000008 /* Compile */
|
||||
#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
|
||||
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
|
||||
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile, used in exec, DFA exec */
|
||||
#define PCRE_EXTRA 0x00000040 /* Compile */
|
||||
#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
|
||||
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
|
||||
#define PCRE_UNGREEDY 0x00000200 /* Compile */
|
||||
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
|
||||
#define PCRE_UTF8 0x00000800 /* Compile */
|
||||
#define PCRE_UTF8 0x00000800 /* Compile, used in exec, DFA exec */
|
||||
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
|
||||
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
|
||||
@@ -119,7 +124,7 @@ compile-time only bits for runtime options, or vice versa. */
|
||||
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
|
||||
#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
|
||||
#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
|
||||
#define PCRE_FIRSTLINE 0x00040000 /* Compile */
|
||||
#define PCRE_FIRSTLINE 0x00040000 /* Compile, used in exec, DFA exec */
|
||||
#define PCRE_DUPNAMES 0x00080000 /* Compile */
|
||||
#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
|
||||
@@ -128,12 +133,12 @@ compile-time only bits for runtime options, or vice versa. */
|
||||
#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile, used in exec */
|
||||
#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
|
||||
#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
|
||||
#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
|
||||
#define PCRE_UCP 0x20000000 /* Compile */
|
||||
#define PCRE_UCP 0x20000000 /* Compile, used in exec, DFA exec */
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
@@ -163,6 +168,33 @@ compile-time only bits for runtime options, or vice versa. */
|
||||
#define PCRE_ERROR_BADNEWLINE (-23)
|
||||
#define PCRE_ERROR_BADOFFSET (-24)
|
||||
#define PCRE_ERROR_SHORTUTF8 (-25)
|
||||
#define PCRE_ERROR_RECURSELOOP (-26)
|
||||
#define PCRE_ERROR_JIT_STACKLIMIT (-27)
|
||||
|
||||
/* Specific error codes for UTF-8 validity checks */
|
||||
|
||||
#define PCRE_UTF8_ERR0 0
|
||||
#define PCRE_UTF8_ERR1 1
|
||||
#define PCRE_UTF8_ERR2 2
|
||||
#define PCRE_UTF8_ERR3 3
|
||||
#define PCRE_UTF8_ERR4 4
|
||||
#define PCRE_UTF8_ERR5 5
|
||||
#define PCRE_UTF8_ERR6 6
|
||||
#define PCRE_UTF8_ERR7 7
|
||||
#define PCRE_UTF8_ERR8 8
|
||||
#define PCRE_UTF8_ERR9 9
|
||||
#define PCRE_UTF8_ERR10 10
|
||||
#define PCRE_UTF8_ERR11 11
|
||||
#define PCRE_UTF8_ERR12 12
|
||||
#define PCRE_UTF8_ERR13 13
|
||||
#define PCRE_UTF8_ERR14 14
|
||||
#define PCRE_UTF8_ERR15 15
|
||||
#define PCRE_UTF8_ERR16 16
|
||||
#define PCRE_UTF8_ERR17 17
|
||||
#define PCRE_UTF8_ERR18 18
|
||||
#define PCRE_UTF8_ERR19 19
|
||||
#define PCRE_UTF8_ERR20 20
|
||||
#define PCRE_UTF8_ERR21 21
|
||||
|
||||
/* Request types for pcre_fullinfo() */
|
||||
|
||||
@@ -183,6 +215,8 @@ compile-time only bits for runtime options, or vice versa. */
|
||||
#define PCRE_INFO_JCHANGED 13
|
||||
#define PCRE_INFO_HASCRORLF 14
|
||||
#define PCRE_INFO_MINLENGTH 15
|
||||
#define PCRE_INFO_JIT 16
|
||||
#define PCRE_INFO_JITSIZE 17
|
||||
|
||||
/* Request types for pcre_config(). Do not re-arrange, in order to remain
|
||||
compatible. */
|
||||
@@ -196,6 +230,12 @@ compatible. */
|
||||
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
|
||||
#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
|
||||
#define PCRE_CONFIG_BSR 8
|
||||
#define PCRE_CONFIG_JIT 9
|
||||
|
||||
/* Request types for pcre_study(). Do not re-arrange, in order to remain
|
||||
compatible. */
|
||||
|
||||
#define PCRE_STUDY_JIT_COMPILE 0x0001
|
||||
|
||||
/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
|
||||
these bits, just add new ones on the end, in order to remain compatible. */
|
||||
@@ -206,12 +246,16 @@ these bits, just add new ones on the end, in order to remain compatible. */
|
||||
#define PCRE_EXTRA_TABLES 0x0008
|
||||
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
|
||||
#define PCRE_EXTRA_MARK 0x0020
|
||||
#define PCRE_EXTRA_EXECUTABLE_JIT 0x0040
|
||||
|
||||
/* Types */
|
||||
|
||||
struct real_pcre; /* declaration; the definition is private */
|
||||
typedef struct real_pcre pcre;
|
||||
|
||||
struct real_pcre_jit_stack; /* declaration; the definition is private */
|
||||
typedef struct real_pcre_jit_stack pcre_jit_stack;
|
||||
|
||||
/* When PCRE is compiled as a C++ library, the subject pointer type can be
|
||||
replaced with a custom type. For conventional use, the public interface is a
|
||||
const char *. */
|
||||
@@ -232,6 +276,7 @@ typedef struct pcre_extra {
|
||||
const unsigned char *tables; /* Pointer to character tables */
|
||||
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
|
||||
unsigned char **mark; /* For passing back a mark pointer */
|
||||
void *executable_jit; /* Contains a pointer to a compiled jit code */
|
||||
} pcre_extra;
|
||||
|
||||
/* The structure for passing out data via the pcre_callout_function. We use a
|
||||
@@ -254,6 +299,8 @@ typedef struct pcre_callout_block {
|
||||
/* ------------------- Added for Version 1 -------------------------- */
|
||||
int pattern_position; /* Offset to next item in the pattern */
|
||||
int next_item_length; /* Length of next item in the pattern */
|
||||
/* ------------------- Added for Version 2 -------------------------- */
|
||||
const unsigned char *mark; /* Pointer to current mark or NULL */
|
||||
/* ------------------------------------------------------------------ */
|
||||
} pcre_callout_block;
|
||||
|
||||
@@ -277,6 +324,10 @@ PCRE_EXP_DECL void pcre_stack_free(void *);
|
||||
PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
|
||||
#endif /* VPCOMPAT */
|
||||
|
||||
/* User defined callback which provides a stack just before the match starts. */
|
||||
|
||||
typedef pcre_jit_stack *(*pcre_jit_callback)(void *);
|
||||
|
||||
/* Exported PCRE functions */
|
||||
|
||||
PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
|
||||
@@ -309,8 +360,15 @@ PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
|
||||
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
|
||||
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
|
||||
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
|
||||
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
|
||||
PCRE_EXP_DECL const char *pcre_version(void);
|
||||
|
||||
/* JIT compiler related functions. */
|
||||
|
||||
PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int);
|
||||
PCRE_EXP_DECL void pcre_jit_stack_free(pcre_jit_stack *);
|
||||
PCRE_EXP_DECL void pcre_assign_jit_stack(pcre_extra *, pcre_jit_callback, void *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2011 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -83,6 +83,14 @@ switch (what)
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_JIT:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((int *)where) = 1;
|
||||
#else
|
||||
*((int *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_NEWLINE:
|
||||
*((int *)where) = NEWLINE;
|
||||
break;
|
||||
|
||||
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language (but see
|
||||
below for why this module is different).
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2011 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -121,18 +121,25 @@ static const uschar coptable[] = {
|
||||
0, 0, /* \P, \p */
|
||||
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
|
||||
0, /* \X */
|
||||
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
|
||||
0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
|
||||
1, /* Char */
|
||||
1, /* Charnc */
|
||||
1, /* Chari */
|
||||
1, /* not */
|
||||
1, /* noti */
|
||||
/* Positive single-char repeats */
|
||||
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
|
||||
3, 3, 3, /* upto, minupto, exact */
|
||||
1, 1, 1, 3, /* *+, ++, ?+, upto+ */
|
||||
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
|
||||
3, 3, 3, /* upto I, minupto I, exact I */
|
||||
1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
|
||||
/* Negative single-char repeats - only for chars < 256 */
|
||||
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
|
||||
3, 3, 3, /* NOT upto, minupto, exact */
|
||||
1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
|
||||
1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
|
||||
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
|
||||
3, 3, 3, /* NOT upto I, minupto I, exact I */
|
||||
1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
|
||||
/* Positive type repeats */
|
||||
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
|
||||
3, 3, 3, /* Type upto, minupto, exact */
|
||||
@@ -144,26 +151,30 @@ static const uschar coptable[] = {
|
||||
0, /* NCLASS */
|
||||
0, /* XCLASS - variable length */
|
||||
0, /* REF */
|
||||
0, /* REFI */
|
||||
0, /* RECURSE */
|
||||
0, /* CALLOUT */
|
||||
0, /* Alt */
|
||||
0, /* Ket */
|
||||
0, /* KetRmax */
|
||||
0, /* KetRmin */
|
||||
0, /* KetRpos */
|
||||
0, /* Reverse */
|
||||
0, /* Assert */
|
||||
0, /* Assert not */
|
||||
0, /* Assert behind */
|
||||
0, /* Assert behind not */
|
||||
0, /* Reverse */
|
||||
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
|
||||
0, 0, 0, /* SBRA, SCBRA, SCOND */
|
||||
0, 0, /* ONCE, ONCE_NC */
|
||||
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
|
||||
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
|
||||
0, 0, /* CREF, NCREF */
|
||||
0, 0, /* RREF, NRREF */
|
||||
0, /* DEF */
|
||||
0, 0, /* BRAZERO, BRAMINZERO */
|
||||
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
|
||||
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
|
||||
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
||||
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
|
||||
0, 0 /* CLOSE, SKIPZERO */
|
||||
};
|
||||
|
||||
/* This table identifies those opcodes that inspect a character. It is used to
|
||||
@@ -179,18 +190,25 @@ static const uschar poptable[] = {
|
||||
1, 1, /* \P, \p */
|
||||
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
|
||||
1, /* \X */
|
||||
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
|
||||
0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
|
||||
1, /* Char */
|
||||
1, /* Charnc */
|
||||
1, /* Chari */
|
||||
1, /* not */
|
||||
1, /* noti */
|
||||
/* Positive single-char repeats */
|
||||
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
|
||||
1, 1, 1, /* upto, minupto, exact */
|
||||
1, 1, 1, 1, /* *+, ++, ?+, upto+ */
|
||||
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
|
||||
1, 1, 1, /* upto I, minupto I, exact I */
|
||||
1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
|
||||
/* Negative single-char repeats - only for chars < 256 */
|
||||
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
|
||||
1, 1, 1, /* NOT upto, minupto, exact */
|
||||
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
|
||||
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
|
||||
1, 1, 1, /* NOT upto I, minupto I, exact I */
|
||||
1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
|
||||
/* Positive type repeats */
|
||||
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
|
||||
1, 1, 1, /* Type upto, minupto, exact */
|
||||
@@ -202,26 +220,30 @@ static const uschar poptable[] = {
|
||||
1, /* NCLASS */
|
||||
1, /* XCLASS - variable length */
|
||||
0, /* REF */
|
||||
0, /* REFI */
|
||||
0, /* RECURSE */
|
||||
0, /* CALLOUT */
|
||||
0, /* Alt */
|
||||
0, /* Ket */
|
||||
0, /* KetRmax */
|
||||
0, /* KetRmin */
|
||||
0, /* KetRpos */
|
||||
0, /* Reverse */
|
||||
0, /* Assert */
|
||||
0, /* Assert not */
|
||||
0, /* Assert behind */
|
||||
0, /* Assert behind not */
|
||||
0, /* Reverse */
|
||||
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
|
||||
0, 0, 0, /* SBRA, SCBRA, SCOND */
|
||||
0, 0, /* ONCE, ONCE_NC */
|
||||
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
|
||||
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
|
||||
0, 0, /* CREF, NCREF */
|
||||
0, 0, /* RREF, NRREF */
|
||||
0, /* DEF */
|
||||
0, 0, /* BRAZERO, BRAMINZERO */
|
||||
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
|
||||
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
|
||||
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
||||
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
|
||||
0, 0 /* CLOSE, SKIPZERO */
|
||||
};
|
||||
|
||||
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
||||
@@ -252,7 +274,6 @@ these structures in, is a vector of ints. */
|
||||
typedef struct stateblock {
|
||||
int offset; /* Offset to opcode */
|
||||
int count; /* Count for repeats */
|
||||
int ims; /* ims flag bits */
|
||||
int data; /* Some use extra data */
|
||||
} stateblock;
|
||||
|
||||
@@ -308,9 +329,7 @@ Arguments:
|
||||
offsetcount size of same
|
||||
workspace vector of workspace
|
||||
wscount size of same
|
||||
ims the current ims flags
|
||||
rlevel function call recursion level
|
||||
recursing regex recursive call level
|
||||
|
||||
Returns: > 0 => number of match offset pairs placed in offsets
|
||||
= 0 => offsets overflowed; longest matches are present
|
||||
@@ -325,7 +344,6 @@ for the current character, one for the following character). */
|
||||
{ \
|
||||
next_active_state->offset = (x); \
|
||||
next_active_state->count = (y); \
|
||||
next_active_state->ims = ims; \
|
||||
next_active_state++; \
|
||||
DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
|
||||
} \
|
||||
@@ -336,7 +354,6 @@ for the current character, one for the following character). */
|
||||
{ \
|
||||
next_active_state->offset = (x); \
|
||||
next_active_state->count = (y); \
|
||||
next_active_state->ims = ims; \
|
||||
next_active_state->data = (z); \
|
||||
next_active_state++; \
|
||||
DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
|
||||
@@ -348,7 +365,6 @@ for the current character, one for the following character). */
|
||||
{ \
|
||||
next_new_state->offset = (x); \
|
||||
next_new_state->count = (y); \
|
||||
next_new_state->ims = ims; \
|
||||
next_new_state++; \
|
||||
DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
|
||||
} \
|
||||
@@ -359,7 +375,6 @@ for the current character, one for the following character). */
|
||||
{ \
|
||||
next_new_state->offset = (x); \
|
||||
next_new_state->count = (y); \
|
||||
next_new_state->ims = ims; \
|
||||
next_new_state->data = (z); \
|
||||
next_new_state++; \
|
||||
DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
|
||||
@@ -378,9 +393,7 @@ internal_dfa_exec(
|
||||
int offsetcount,
|
||||
int *workspace,
|
||||
int wscount,
|
||||
int ims,
|
||||
int rlevel,
|
||||
int recursing)
|
||||
int rlevel)
|
||||
{
|
||||
stateblock *active_states, *new_states, *temp_states;
|
||||
stateblock *next_active_state, *next_new_state;
|
||||
@@ -389,6 +402,8 @@ const uschar *ctypes, *lcc, *fcc;
|
||||
const uschar *ptr;
|
||||
const uschar *end_code, *first_op;
|
||||
|
||||
dfa_recursion_info new_recursive;
|
||||
|
||||
int active_count, new_count, match_count;
|
||||
|
||||
/* Some fields in the md block are frequently referenced, so we load them into
|
||||
@@ -412,8 +427,8 @@ wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
|
||||
(2 * INTS_PER_STATEBLOCK);
|
||||
|
||||
DPRINTF(("\n%.*s---------------------\n"
|
||||
"%.*sCall to internal_dfa_exec f=%d r=%d\n",
|
||||
rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
|
||||
"%.*sCall to internal_dfa_exec f=%d\n",
|
||||
rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
|
||||
|
||||
ctypes = md->tables + ctypes_offset;
|
||||
lcc = md->tables + lcc_offset;
|
||||
@@ -426,7 +441,8 @@ next_new_state = new_states = active_states + wscount;
|
||||
new_count = 0;
|
||||
|
||||
first_op = this_start_code + 1 + LINK_SIZE +
|
||||
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
||||
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
|
||||
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
|
||||
|
||||
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
|
||||
the alternative states onto the list, and find out where the end is. This
|
||||
@@ -525,7 +541,9 @@ else
|
||||
else
|
||||
{
|
||||
int length = 1 + LINK_SIZE +
|
||||
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
||||
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
|
||||
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
|
||||
2:0);
|
||||
do
|
||||
{
|
||||
ADD_NEW((int)(end_code - start_code + length), 0);
|
||||
@@ -605,6 +623,7 @@ for (;;)
|
||||
for (i = 0; i < active_count; i++)
|
||||
{
|
||||
stateblock *current_state = active_states + i;
|
||||
BOOL caseless = FALSE;
|
||||
const uschar *code;
|
||||
int state_offset = current_state->offset;
|
||||
int count, codevalue, rrc;
|
||||
@@ -616,10 +635,6 @@ for (;;)
|
||||
else printf("0x%02x\n", c);
|
||||
#endif
|
||||
|
||||
/* This variable is referred to implicity in the ADD_xxx macros. */
|
||||
|
||||
ims = current_state->ims;
|
||||
|
||||
/* A negative offset is a special case meaning "hold off going to this
|
||||
(negated) state until the number of characters in the data field have
|
||||
been skipped". */
|
||||
@@ -725,7 +740,12 @@ for (;;)
|
||||
|
||||
/* ========================================================================== */
|
||||
/* Reached a closing bracket. If not at the end of the pattern, carry
|
||||
on with the next opcode. Otherwise, unless we have an empty string and
|
||||
on with the next opcode. For repeating opcodes, also add the repeat
|
||||
state. Note that KETRPOS will always be encountered at the end of the
|
||||
subpattern, because the possessive subpattern repeats are always handled
|
||||
using recursive calls. Thus, it never adds any new states.
|
||||
|
||||
At the end of the (sub)pattern, unless we have an empty string and
|
||||
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
|
||||
start of the subject, save the match data, shifting up all previous
|
||||
matches so we always have the longest first. */
|
||||
@@ -733,6 +753,7 @@ for (;;)
|
||||
case OP_KET:
|
||||
case OP_KETRMIN:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRPOS:
|
||||
if (code != end_code)
|
||||
{
|
||||
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
|
||||
@@ -749,7 +770,7 @@ for (;;)
|
||||
current_subject > start_subject + md->start_offset)))
|
||||
{
|
||||
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
|
||||
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
|
||||
else if (match_count > 0 && ++match_count * 2 > offsetcount)
|
||||
match_count = 0;
|
||||
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
|
||||
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
||||
@@ -822,10 +843,14 @@ for (;;)
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_CIRC:
|
||||
if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_CIRCM:
|
||||
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
|
||||
((ims & PCRE_MULTILINE) != 0 &&
|
||||
ptr != end_subject &&
|
||||
WAS_NEWLINE(ptr)))
|
||||
(ptr != end_subject && WAS_NEWLINE(ptr)))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
break;
|
||||
|
||||
@@ -839,12 +864,6 @@ for (;;)
|
||||
}
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_OPT:
|
||||
ims = code[1];
|
||||
ADD_ACTIVE(state_offset + 2, 0);
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_SOD:
|
||||
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
||||
@@ -890,11 +909,23 @@ for (;;)
|
||||
could_continue = TRUE;
|
||||
else if (clen == 0 ||
|
||||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
|
||||
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
|
||||
(ptr == end_subject - md->nllen)
|
||||
))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
}
|
||||
else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_DOLLM:
|
||||
if ((md->moptions & PCRE_NOTEOL) == 0)
|
||||
{
|
||||
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
||||
could_continue = TRUE;
|
||||
else if (clen == 0 ||
|
||||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
}
|
||||
else if (IS_NEWLINE(ptr))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
break;
|
||||
|
||||
@@ -1950,7 +1981,7 @@ for (;;)
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_CHARNC:
|
||||
case OP_CHARI:
|
||||
if (clen == 0) break;
|
||||
|
||||
#ifdef SUPPORT_UTF8
|
||||
@@ -2136,19 +2167,35 @@ for (;;)
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
/* Match a negated single character. This is only used for one-byte
|
||||
characters, that is, we know that d < 256. The character we are
|
||||
/* Match a negated single character casefully. This is only used for
|
||||
one-byte characters, that is, we know that d < 256. The character we are
|
||||
checking (c) can be multibyte. */
|
||||
|
||||
case OP_NOT:
|
||||
if (clen > 0)
|
||||
{
|
||||
unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
|
||||
if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
|
||||
}
|
||||
if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
/* Match a negated single character caselessly. This is only used for
|
||||
one-byte characters, that is, we know that d < 256. The character we are
|
||||
checking (c) can be multibyte. */
|
||||
|
||||
case OP_NOTI:
|
||||
if (clen > 0 && c != d && c != fcc[d])
|
||||
{ ADD_NEW(state_offset + dlen + 1, 0); }
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_POSPLUSI:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTPOSPLUSI:
|
||||
caseless = TRUE;
|
||||
codevalue -= OP_STARI - OP_STAR;
|
||||
|
||||
/* Fall through */
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
@@ -2160,7 +2207,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
unsigned int otherd = NOTACHAR;
|
||||
if ((ims & PCRE_CASELESS) != 0)
|
||||
if (caseless)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && d >= 128)
|
||||
@@ -2188,6 +2235,15 @@ for (;;)
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_POSQUERYI:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTPOSQUERYI:
|
||||
caseless = TRUE;
|
||||
codevalue -= OP_STARI - OP_STAR;
|
||||
/* Fall through */
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
@@ -2198,7 +2254,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
unsigned int otherd = NOTACHAR;
|
||||
if ((ims & PCRE_CASELESS) != 0)
|
||||
if (caseless)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && d >= 128)
|
||||
@@ -2224,6 +2280,15 @@ for (;;)
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_POSSTARI:
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTPOSSTARI:
|
||||
caseless = TRUE;
|
||||
codevalue -= OP_STARI - OP_STAR;
|
||||
/* Fall through */
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_POSSTAR:
|
||||
@@ -2234,7 +2299,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
unsigned int otherd = NOTACHAR;
|
||||
if ((ims & PCRE_CASELESS) != 0)
|
||||
if (caseless)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && d >= 128)
|
||||
@@ -2260,13 +2325,18 @@ for (;;)
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_EXACTI:
|
||||
case OP_NOTEXACTI:
|
||||
caseless = TRUE;
|
||||
codevalue -= OP_STARI - OP_STAR;
|
||||
/* Fall through */
|
||||
case OP_EXACT:
|
||||
case OP_NOTEXACT:
|
||||
count = current_state->count; /* Number already matched */
|
||||
if (clen > 0)
|
||||
{
|
||||
unsigned int otherd = NOTACHAR;
|
||||
if ((ims & PCRE_CASELESS) != 0)
|
||||
if (caseless)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && d >= 128)
|
||||
@@ -2290,6 +2360,15 @@ for (;;)
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_POSUPTOI:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTPOSUPTOI:
|
||||
caseless = TRUE;
|
||||
codevalue -= OP_STARI - OP_STAR;
|
||||
/* Fall through */
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
@@ -2301,7 +2380,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
unsigned int otherd = NOTACHAR;
|
||||
if ((ims & PCRE_CASELESS) != 0)
|
||||
if (caseless)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && d >= 128)
|
||||
@@ -2444,9 +2523,7 @@ for (;;)
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
sizeof(local_workspace)/sizeof(int), /* size of same */
|
||||
ims, /* the current ims flags */
|
||||
rlevel, /* function recursion level */
|
||||
recursing); /* pass on regex recursion */
|
||||
rlevel); /* function recursion level */
|
||||
|
||||
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
||||
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
|
||||
@@ -2485,6 +2562,7 @@ for (;;)
|
||||
cb.capture_top = 1;
|
||||
cb.capture_last = -1;
|
||||
cb.callout_data = md->callout_data;
|
||||
cb.mark = NULL; /* No (*MARK) support */
|
||||
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
||||
}
|
||||
if (rrc > 0) break; /* Fail this thread */
|
||||
@@ -2511,7 +2589,7 @@ for (;;)
|
||||
{
|
||||
int value = GET2(code, LINK_SIZE+2);
|
||||
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
|
||||
if (recursing > 0)
|
||||
if (md->recursive != NULL)
|
||||
{ ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
|
||||
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
||||
}
|
||||
@@ -2535,9 +2613,7 @@ for (;;)
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
sizeof(local_workspace)/sizeof(int), /* size of same */
|
||||
ims, /* the current ims flags */
|
||||
rlevel, /* function recursion level */
|
||||
recursing); /* pass on regex recursion */
|
||||
rlevel); /* function recursion level */
|
||||
|
||||
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
||||
if ((rc >= 0) ==
|
||||
@@ -2552,28 +2628,47 @@ for (;;)
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_RECURSE:
|
||||
{
|
||||
dfa_recursion_info *ri;
|
||||
int local_offsets[1000];
|
||||
int local_workspace[1000];
|
||||
const uschar *callpat = start_code + GET(code, 1);
|
||||
int recno = (callpat == md->start_code)? 0 :
|
||||
GET2(callpat, 1 + LINK_SIZE);
|
||||
int rc;
|
||||
|
||||
DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
|
||||
recursing + 1));
|
||||
DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
|
||||
|
||||
/* Check for repeating a recursion without advancing the subject
|
||||
pointer. This should catch convoluted mutual recursions. (Some simple
|
||||
cases are caught at compile time.) */
|
||||
|
||||
for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
|
||||
if (recno == ri->group_num && ptr == ri->subject_position)
|
||||
return PCRE_ERROR_RECURSELOOP;
|
||||
|
||||
/* Remember this recursion and where we started it so as to
|
||||
catch infinite loops. */
|
||||
|
||||
new_recursive.group_num = recno;
|
||||
new_recursive.subject_position = ptr;
|
||||
new_recursive.prevrec = md->recursive;
|
||||
md->recursive = &new_recursive;
|
||||
|
||||
rc = internal_dfa_exec(
|
||||
md, /* fixed match data */
|
||||
start_code + GET(code, 1), /* this subexpression's code */
|
||||
callpat, /* this subexpression's code */
|
||||
ptr, /* where we currently are */
|
||||
(int)(ptr - start_subject), /* start offset */
|
||||
local_offsets, /* offset vector */
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
sizeof(local_workspace)/sizeof(int), /* size of same */
|
||||
ims, /* the current ims flags */
|
||||
rlevel, /* function recursion level */
|
||||
recursing + 1); /* regex recurse level */
|
||||
rlevel); /* function recursion level */
|
||||
|
||||
DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
|
||||
recursing + 1, rc));
|
||||
md->recursive = new_recursive.prevrec; /* Done this recursion */
|
||||
|
||||
DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
|
||||
rc));
|
||||
|
||||
/* Ran out of internal offsets */
|
||||
|
||||
@@ -2605,8 +2700,98 @@ for (;;)
|
||||
}
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
case OP_BRAPOSZERO:
|
||||
{
|
||||
int charcount, matched_count;
|
||||
const uschar *local_ptr = ptr;
|
||||
BOOL allow_zero;
|
||||
|
||||
if (codevalue == OP_BRAPOSZERO)
|
||||
{
|
||||
allow_zero = TRUE;
|
||||
codevalue = *(++code); /* Codevalue will be one of above BRAs */
|
||||
}
|
||||
else allow_zero = FALSE;
|
||||
|
||||
/* Loop to match the subpattern as many times as possible as if it were
|
||||
a complete pattern. */
|
||||
|
||||
for (matched_count = 0;; matched_count++)
|
||||
{
|
||||
int local_offsets[2];
|
||||
int local_workspace[1000];
|
||||
|
||||
int rc = internal_dfa_exec(
|
||||
md, /* fixed match data */
|
||||
code, /* this subexpression's code */
|
||||
local_ptr, /* where we currently are */
|
||||
(int)(ptr - start_subject), /* start offset */
|
||||
local_offsets, /* offset vector */
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
sizeof(local_workspace)/sizeof(int), /* size of same */
|
||||
rlevel); /* function recursion level */
|
||||
|
||||
/* Failed to match */
|
||||
|
||||
if (rc < 0)
|
||||
{
|
||||
if (rc != PCRE_ERROR_NOMATCH) return rc;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Matched: break the loop if zero characters matched. */
|
||||
|
||||
charcount = local_offsets[1] - local_offsets[0];
|
||||
if (charcount == 0) break;
|
||||
local_ptr += charcount; /* Advance temporary position ptr */
|
||||
}
|
||||
|
||||
/* At this point we have matched the subpattern matched_count
|
||||
times, and local_ptr is pointing to the character after the end of the
|
||||
last match. */
|
||||
|
||||
if (matched_count > 0 || allow_zero)
|
||||
{
|
||||
const uschar *end_subpattern = code;
|
||||
int next_state_offset;
|
||||
|
||||
do { end_subpattern += GET(end_subpattern, 1); }
|
||||
while (*end_subpattern == OP_ALT);
|
||||
next_state_offset =
|
||||
(int)(end_subpattern - start_code + LINK_SIZE + 1);
|
||||
|
||||
/* Optimization: if there are no more active states, and there
|
||||
are no new states yet set up, then skip over the subject string
|
||||
right here, to save looping. Otherwise, set up the new state to swing
|
||||
into action when the end of the matched substring is reached. */
|
||||
|
||||
if (i + 1 >= active_count && new_count == 0)
|
||||
{
|
||||
ptr = local_ptr;
|
||||
clen = 0;
|
||||
ADD_NEW(next_state_offset, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
const uschar *p = ptr;
|
||||
const uschar *pp = local_ptr;
|
||||
charcount = (int)(pp - p);
|
||||
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
|
||||
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
{
|
||||
int local_offsets[2];
|
||||
int local_workspace[1000];
|
||||
@@ -2620,9 +2805,7 @@ for (;;)
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
sizeof(local_workspace)/sizeof(int), /* size of same */
|
||||
ims, /* the current ims flags */
|
||||
rlevel, /* function recursion level */
|
||||
recursing); /* pass on regex recursion */
|
||||
rlevel); /* function recursion level */
|
||||
|
||||
if (rc >= 0)
|
||||
{
|
||||
@@ -2656,7 +2839,7 @@ for (;;)
|
||||
/* Optimization: if there are no more active states, and there
|
||||
are no new states yet set up, then skip over the subject string
|
||||
right here, to save looping. Otherwise, set up the new state to swing
|
||||
into action when the end of the substring is reached. */
|
||||
into action when the end of the matched substring is reached. */
|
||||
|
||||
else if (i + 1 >= active_count && new_count == 0)
|
||||
{
|
||||
@@ -2686,7 +2869,6 @@ for (;;)
|
||||
if (repeat_state_offset >= 0)
|
||||
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
|
||||
}
|
||||
|
||||
}
|
||||
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
||||
}
|
||||
@@ -2713,6 +2895,7 @@ for (;;)
|
||||
cb.capture_top = 1;
|
||||
cb.capture_last = -1;
|
||||
cb.callout_data = md->callout_data;
|
||||
cb.mark = NULL; /* No (*MARK) support */
|
||||
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
||||
}
|
||||
if (rrc == 0)
|
||||
@@ -2963,15 +3146,21 @@ back the character offset. */
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
|
||||
{
|
||||
int tb;
|
||||
if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
|
||||
return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
|
||||
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
|
||||
if (start_offset > 0 && start_offset < length)
|
||||
int erroroffset;
|
||||
int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
|
||||
if (errorcode != 0)
|
||||
{
|
||||
tb = ((USPTR)subject)[start_offset] & 0xc0;
|
||||
if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
|
||||
if (offsetcount >= 2)
|
||||
{
|
||||
offsets[0] = erroroffset;
|
||||
offsets[1] = errorcode;
|
||||
}
|
||||
return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
|
||||
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
|
||||
}
|
||||
if (start_offset > 0 && start_offset < length &&
|
||||
(((USPTR)subject)[start_offset] & 0xc0) == 0x80)
|
||||
return PCRE_ERROR_BADUTF8_OFFSET;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3141,7 +3330,7 @@ for (;;)
|
||||
disabling is explicitly requested (and of course, by the test above, this
|
||||
code is not obeyed when restarting after a partial match). */
|
||||
|
||||
if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
|
||||
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
|
||||
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
|
||||
{
|
||||
/* If the pattern was studied, a minimum subject length may be set. This
|
||||
@@ -3209,6 +3398,7 @@ for (;;)
|
||||
/* OK, now we can do the business */
|
||||
|
||||
md->start_used_ptr = current_subject;
|
||||
md->recursive = NULL;
|
||||
|
||||
rc = internal_dfa_exec(
|
||||
md, /* fixed match data */
|
||||
@@ -3219,9 +3409,7 @@ for (;;)
|
||||
offsetcount, /* size of same */
|
||||
workspace, /* workspace vector */
|
||||
wscount, /* size of same */
|
||||
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
|
||||
0, /* function recurse level */
|
||||
0); /* regex recurse level */
|
||||
0); /* function recurse level */
|
||||
|
||||
/* Anything other than "no match" means we are done, always; otherwise, carry
|
||||
on only if not anchored. */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2011 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -100,6 +100,19 @@ switch (what)
|
||||
*((size_t *)where) = (study == NULL)? 0 : study->size;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_JITSIZE:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((size_t *)where) =
|
||||
(extra_data != NULL &&
|
||||
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
|
||||
extra_data->executable_jit != NULL)?
|
||||
_pcre_jit_get_size(extra_data->executable_jit) : 0;
|
||||
#else
|
||||
*((size_t *)where) = 0;
|
||||
#endif
|
||||
|
||||
break;
|
||||
|
||||
case PCRE_INFO_CAPTURECOUNT:
|
||||
*((int *)where) = re->top_bracket;
|
||||
break;
|
||||
@@ -129,6 +142,12 @@ switch (what)
|
||||
(int)study->minlength : -1;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_JIT:
|
||||
*((int *)where) = extra_data != NULL &&
|
||||
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
|
||||
extra_data->executable_jit != NULL;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_LASTLITERAL:
|
||||
*((int *)where) =
|
||||
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
Copyright (c) 1997-2011 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -582,10 +582,6 @@ Standard C system should have one. */
|
||||
#endif
|
||||
|
||||
|
||||
/* These are the public options that can change during matching. */
|
||||
|
||||
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
|
||||
|
||||
/* Private flags containing information about the compiled regex. They used to
|
||||
live at the top end of the options word, but that got almost full, so now they
|
||||
are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
|
||||
@@ -598,11 +594,12 @@ compatibility. */
|
||||
#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
|
||||
#define PCRE_JCHANGED 0x0010 /* j option used in regex */
|
||||
#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
|
||||
#define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */
|
||||
|
||||
/* Options for the "extra" block produced by pcre_study(). */
|
||||
/* Flags for the "extra" block produced by pcre_study(). */
|
||||
|
||||
#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
|
||||
#define PCRE_STUDY_MINLEN 0x02 /* a minimum length field exists */
|
||||
#define PCRE_STUDY_MAPPED 0x0001 /* a map of starting chars exists */
|
||||
#define PCRE_STUDY_MINLEN 0x0002 /* a minimum length field exists */
|
||||
|
||||
/* Masks for identifying the public options that are permitted at compile
|
||||
time, run time, or study time, respectively. */
|
||||
@@ -628,7 +625,8 @@ time, run time, or study time, respectively. */
|
||||
PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
|
||||
PCRE_NO_START_OPTIMIZE)
|
||||
|
||||
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
|
||||
#define PUBLIC_STUDY_OPTIONS \
|
||||
PCRE_STUDY_JIT_COMPILE
|
||||
|
||||
/* Magic number to provide a small check against being handed junk. Also used
|
||||
to detect whether a pattern was compiled on a host of different endianness. */
|
||||
@@ -1254,8 +1252,8 @@ value such as \n. They must have non-zero values, as check_escape() returns
|
||||
their negation. Also, they must appear in the same order as in the opcode
|
||||
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
|
||||
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
|
||||
used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
|
||||
like \N.
|
||||
used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In
|
||||
non-DOTALL mode, "." behaves like \N.
|
||||
|
||||
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
|
||||
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
|
||||
@@ -1299,6 +1297,7 @@ enum {
|
||||
OP_WHITESPACE, /* 9 \s */
|
||||
OP_NOT_WORDCHAR, /* 10 \W */
|
||||
OP_WORDCHAR, /* 11 \w */
|
||||
|
||||
OP_ANY, /* 12 Match any character except newline */
|
||||
OP_ALLANY, /* 13 Match any character */
|
||||
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
@@ -1313,141 +1312,205 @@ enum {
|
||||
OP_EODN, /* 23 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 24 End of data: \z */
|
||||
|
||||
OP_OPT, /* 25 Set runtime options */
|
||||
OP_CIRC, /* 26 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 27 End of line - varies with multiline switch */
|
||||
OP_CHAR, /* 28 Match one character, casefully */
|
||||
OP_CHARNC, /* 29 Match one character, caselessly */
|
||||
OP_NOT, /* 30 Match one character, not the following one */
|
||||
OP_CIRC, /* 25 Start of line - not multiline */
|
||||
OP_CIRCM, /* 26 Start of line - multiline */
|
||||
OP_DOLL, /* 27 End of line - not multiline */
|
||||
OP_DOLLM, /* 28 End of line - multiline */
|
||||
OP_CHAR, /* 29 Match one character, casefully */
|
||||
OP_CHARI, /* 30 Match one character, caselessly */
|
||||
OP_NOT, /* 31 Match one character, not the given one, casefully */
|
||||
OP_NOTI, /* 32 Match one character, not the given one, caselessly */
|
||||
|
||||
OP_STAR, /* 31 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 33 the minimizing one second. */
|
||||
OP_MINPLUS, /* 34 This first set applies to single characters.*/
|
||||
OP_QUERY, /* 35 */
|
||||
OP_MINQUERY, /* 36 */
|
||||
/* The following sets of 13 opcodes must always be kept in step because
|
||||
the offset from the first one is used to generate the others. */
|
||||
|
||||
OP_UPTO, /* 37 From 0 to n matches */
|
||||
OP_MINUPTO, /* 38 */
|
||||
OP_EXACT, /* 39 Exactly n matches */
|
||||
/**** Single characters, caseful, must precede the caseless ones ****/
|
||||
|
||||
OP_POSSTAR, /* 40 Possessified star */
|
||||
OP_POSPLUS, /* 41 Possessified plus */
|
||||
OP_POSQUERY, /* 42 Posesssified query */
|
||||
OP_POSUPTO, /* 43 Possessified upto */
|
||||
OP_STAR, /* 33 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 35 the minimizing one second. */
|
||||
OP_MINPLUS, /* 36 */
|
||||
OP_QUERY, /* 37 */
|
||||
OP_MINQUERY, /* 38 */
|
||||
|
||||
OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
|
||||
OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
|
||||
OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
|
||||
OP_NOTMINQUERY, /* 49 */
|
||||
OP_UPTO, /* 39 From 0 to n matches of one character, caseful*/
|
||||
OP_MINUPTO, /* 40 */
|
||||
OP_EXACT, /* 41 Exactly n matches */
|
||||
|
||||
OP_NOTUPTO, /* 50 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 51 */
|
||||
OP_NOTEXACT, /* 52 Exactly n matches */
|
||||
OP_POSSTAR, /* 42 Possessified star, caseful */
|
||||
OP_POSPLUS, /* 43 Possessified plus, caseful */
|
||||
OP_POSQUERY, /* 44 Possessified query, caseful */
|
||||
OP_POSUPTO, /* 45 Possessified upto, caseful */
|
||||
|
||||
OP_NOTPOSSTAR, /* 53 Possessified versions */
|
||||
OP_NOTPOSPLUS, /* 54 */
|
||||
OP_NOTPOSQUERY, /* 55 */
|
||||
OP_NOTPOSUPTO, /* 56 */
|
||||
/**** Single characters, caseless, must follow the caseful ones */
|
||||
|
||||
OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 62 */
|
||||
OP_STARI, /* 46 */
|
||||
OP_MINSTARI, /* 47 */
|
||||
OP_PLUSI, /* 48 */
|
||||
OP_MINPLUSI, /* 49 */
|
||||
OP_QUERYI, /* 50 */
|
||||
OP_MINQUERYI, /* 51 */
|
||||
|
||||
OP_TYPEUPTO, /* 63 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 64 */
|
||||
OP_TYPEEXACT, /* 65 Exactly n matches */
|
||||
OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */
|
||||
OP_MINUPTOI, /* 53 */
|
||||
OP_EXACTI, /* 54 */
|
||||
|
||||
OP_TYPEPOSSTAR, /* 66 Possessified versions */
|
||||
OP_TYPEPOSPLUS, /* 67 */
|
||||
OP_TYPEPOSQUERY, /* 68 */
|
||||
OP_TYPEPOSUPTO, /* 69 */
|
||||
OP_POSSTARI, /* 55 Possessified star, caseless */
|
||||
OP_POSPLUSI, /* 56 Possessified plus, caseless */
|
||||
OP_POSQUERYI, /* 57 Possessified query, caseless */
|
||||
OP_POSUPTOI, /* 58 Possessified upto, caseless */
|
||||
|
||||
OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 72 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 74 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 75 */
|
||||
OP_CRRANGE, /* 76 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 77 */
|
||||
/**** The negated ones must follow the non-negated ones, and match them ****/
|
||||
/**** Negated single character, caseful; must precede the caseless ones ****/
|
||||
|
||||
OP_CLASS, /* 78 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
|
||||
class - the difference is relevant only when a UTF-8
|
||||
character > 255 is encountered. */
|
||||
OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 61 the minimizing one second. They must be in */
|
||||
OP_NOTMINPLUS, /* 62 exactly the same order as those above. */
|
||||
OP_NOTQUERY, /* 63 */
|
||||
OP_NOTMINQUERY, /* 64 */
|
||||
|
||||
OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
|
||||
class. This does both positive and negative. */
|
||||
OP_NOTUPTO, /* 65 From 0 to n matches, caseful */
|
||||
OP_NOTMINUPTO, /* 66 */
|
||||
OP_NOTEXACT, /* 67 Exactly n matches */
|
||||
|
||||
OP_REF, /* 81 Match a back reference */
|
||||
OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 83 Call out to external function if provided */
|
||||
OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */
|
||||
OP_NOTPOSPLUS, /* 69 */
|
||||
OP_NOTPOSQUERY, /* 70 */
|
||||
OP_NOTPOSUPTO, /* 71 */
|
||||
|
||||
OP_ALT, /* 84 Start of alternation */
|
||||
OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 86 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 87 order. They are for groups the repeat for ever. */
|
||||
/**** Negated single character, caseless; must follow the caseful ones ****/
|
||||
|
||||
/* The assertions must come before BRA, CBRA, ONCE, and COND.*/
|
||||
OP_NOTSTARI, /* 72 */
|
||||
OP_NOTMINSTARI, /* 73 */
|
||||
OP_NOTPLUSI, /* 74 */
|
||||
OP_NOTMINPLUSI, /* 75 */
|
||||
OP_NOTQUERYI, /* 76 */
|
||||
OP_NOTMINQUERYI, /* 77 */
|
||||
|
||||
OP_ASSERT, /* 88 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 89 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 90 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
|
||||
OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
|
||||
OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */
|
||||
OP_NOTMINUPTOI, /* 79 */
|
||||
OP_NOTEXACTI, /* 80 Exactly n matches */
|
||||
|
||||
/* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
|
||||
as there's a test for >= ONCE for a subpattern that isn't an assertion. */
|
||||
OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */
|
||||
OP_NOTPOSPLUSI, /* 82 */
|
||||
OP_NOTPOSQUERYI, /* 83 */
|
||||
OP_NOTPOSUPTOI, /* 84 */
|
||||
|
||||
OP_ONCE, /* 93 Atomic group */
|
||||
OP_BRA, /* 94 Start of non-capturing bracket */
|
||||
OP_CBRA, /* 95 Start of capturing bracket */
|
||||
OP_COND, /* 96 Conditional group */
|
||||
/**** Character types ****/
|
||||
|
||||
/* These three must follow the previous three, in the same order. There's a
|
||||
OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 89 */
|
||||
OP_TYPEMINQUERY, /* 90 */
|
||||
|
||||
OP_TYPEUPTO, /* 91 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 92 */
|
||||
OP_TYPEEXACT, /* 93 Exactly n matches */
|
||||
|
||||
OP_TYPEPOSSTAR, /* 94 Possessified versions */
|
||||
OP_TYPEPOSPLUS, /* 95 */
|
||||
OP_TYPEPOSQUERY, /* 96 */
|
||||
OP_TYPEPOSUPTO, /* 97 */
|
||||
|
||||
/* These are used for character classes and back references; only the
|
||||
first six are the same as the sets above. */
|
||||
|
||||
OP_CRSTAR, /* 98 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 100 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 102 */
|
||||
OP_CRMINQUERY, /* 103 */
|
||||
|
||||
OP_CRRANGE, /* 104 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 105 */
|
||||
|
||||
/* End of quantifier opcodes */
|
||||
|
||||
OP_CLASS, /* 106 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 107 Same, but the bitmap was created from a negative
|
||||
class - the difference is relevant only when a
|
||||
UTF-8 character > 255 is encountered. */
|
||||
OP_XCLASS, /* 108 Extended class for handling UTF-8 chars within the
|
||||
class. This does both positive and negative. */
|
||||
OP_REF, /* 109 Match a back reference, casefully */
|
||||
OP_REFI, /* 110 Match a back reference, caselessly */
|
||||
OP_RECURSE, /* 111 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 112 Call out to external function if provided */
|
||||
|
||||
OP_ALT, /* 113 Start of alternation */
|
||||
OP_KET, /* 114 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 115 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 116 order. They are for groups that repeat for ever. */
|
||||
OP_KETRPOS, /* 117 Possessive unlimited repeat. */
|
||||
|
||||
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
|
||||
asserts must remain in order. */
|
||||
|
||||
OP_REVERSE, /* 118 Move pointer back - used in lookbehind assertions */
|
||||
OP_ASSERT, /* 119 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 120 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 121 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 122 Negative lookbehind */
|
||||
|
||||
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
|
||||
after the assertions, with ONCE first, as there's a test for >= ONCE for a
|
||||
subpattern that isn't an assertion. The POS versions must immediately follow
|
||||
the non-POS versions in each case. */
|
||||
|
||||
OP_ONCE, /* 123 Atomic group, contains captures */
|
||||
OP_ONCE_NC, /* 124 Atomic group containing no captures */
|
||||
OP_BRA, /* 125 Start of non-capturing bracket */
|
||||
OP_BRAPOS, /* 126 Ditto, with unlimited, possessive repeat */
|
||||
OP_CBRA, /* 127 Start of capturing bracket */
|
||||
OP_CBRAPOS, /* 128 Ditto, with unlimited, possessive repeat */
|
||||
OP_COND, /* 129 Conditional group */
|
||||
|
||||
/* These five must follow the previous five, in the same order. There's a
|
||||
check for >= SBRA to distinguish the two sets. */
|
||||
|
||||
OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
|
||||
OP_SCBRA, /* 98 Start of capturing bracket, check empty */
|
||||
OP_SCOND, /* 99 Conditional group, check empty */
|
||||
OP_SBRA, /* 130 Start of non-capturing bracket, check empty */
|
||||
OP_SBRAPOS, /* 131 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCBRA, /* 132 Start of capturing bracket, check empty */
|
||||
OP_SCBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCOND, /* 134 Conditional group, check empty */
|
||||
|
||||
/* The next two pairs must (respectively) be kept together. */
|
||||
|
||||
OP_CREF, /* 100 Used to hold a capture number as condition */
|
||||
OP_NCREF, /* 101 Same, but generaged by a name reference*/
|
||||
OP_RREF, /* 102 Used to hold a recursion number as condition */
|
||||
OP_NRREF, /* 103 Same, but generaged by a name reference*/
|
||||
OP_DEF, /* 104 The DEFINE condition */
|
||||
OP_CREF, /* 135 Used to hold a capture number as condition */
|
||||
OP_NCREF, /* 136 Same, but generated by a name reference*/
|
||||
OP_RREF, /* 137 Used to hold a recursion number as condition */
|
||||
OP_NRREF, /* 138 Same, but generated by a name reference*/
|
||||
OP_DEF, /* 139 The DEFINE condition */
|
||||
|
||||
OP_BRAZERO, /* 105 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 106 order. */
|
||||
OP_BRAZERO, /* 140 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 141 order. */
|
||||
OP_BRAPOSZERO, /* 142 */
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_MARK, /* 107 always has an argument */
|
||||
OP_PRUNE, /* 108 */
|
||||
OP_PRUNE_ARG, /* 109 same, but with argument */
|
||||
OP_SKIP, /* 110 */
|
||||
OP_SKIP_ARG, /* 111 same, but with argument */
|
||||
OP_THEN, /* 112 */
|
||||
OP_THEN_ARG, /* 113 same, but with argument */
|
||||
OP_COMMIT, /* 114 */
|
||||
OP_MARK, /* 143 always has an argument */
|
||||
OP_PRUNE, /* 144 */
|
||||
OP_PRUNE_ARG, /* 145 same, but with argument */
|
||||
OP_SKIP, /* 146 */
|
||||
OP_SKIP_ARG, /* 147 same, but with argument */
|
||||
OP_THEN, /* 148 */
|
||||
OP_THEN_ARG, /* 149 same, but with argument */
|
||||
OP_COMMIT, /* 150 */
|
||||
|
||||
/* These are forced failure and success verbs */
|
||||
|
||||
OP_FAIL, /* 115 */
|
||||
OP_ACCEPT, /* 116 */
|
||||
OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
|
||||
OP_FAIL, /* 151 */
|
||||
OP_ACCEPT, /* 152 */
|
||||
OP_ASSERT_ACCEPT, /* 153 Used inside assertions */
|
||||
OP_CLOSE, /* 154 Used before OP_ACCEPT to close open captures */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO, /* 118 */
|
||||
OP_SKIPZERO, /* 155 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
@@ -1462,29 +1525,45 @@ called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
for debugging, and some of them are only partial names. The macro is referenced
|
||||
only in pcre_printint.c, which fills out the full names in many cases (and in
|
||||
some cases doesn't actually use these names at all). */
|
||||
|
||||
#define OP_NAME_LIST \
|
||||
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
|
||||
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
|
||||
"extuni", "\\Z", "\\z", \
|
||||
"Opt", "^", "$", "char", "charnc", "not", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"^", "^", "$", "$", "char", "chari", "not", "noti", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", \
|
||||
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
|
||||
"AssertB", "AssertB not", "Reverse", \
|
||||
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
|
||||
"class", "nclass", "xclass", "Ref", "Refi", \
|
||||
"Recurse", "Callout", \
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
|
||||
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
|
||||
"Once", "Once_NC", \
|
||||
"Bra", "BraPos", "CBra", "CBraPos", \
|
||||
"Cond", \
|
||||
"SBra", "SBraPos", "SCBra", "SCBraPos", \
|
||||
"SCond", \
|
||||
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
|
||||
"Brazero", "Braminzero", \
|
||||
"Brazero", "Braminzero", "Braposzero", \
|
||||
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
|
||||
"*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
|
||||
"*THEN", "*THEN", "*COMMIT", "*FAIL", \
|
||||
"*ACCEPT", "*ASSERT_ACCEPT", \
|
||||
"Close", "Skip zero"
|
||||
|
||||
|
||||
@@ -1505,18 +1584,25 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
3, 3, /* \P, \p */ \
|
||||
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
|
||||
1, /* \X */ \
|
||||
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
|
||||
1, 1, 1, 1, 1, 1, /* \Z, \z, ^, ^M, $, $M */ \
|
||||
2, /* Char - the minimum length */ \
|
||||
2, /* Charnc - the minimum length */ \
|
||||
2, /* Chari - the minimum length */ \
|
||||
2, /* not */ \
|
||||
/* Positive single-char repeats ** These are */ \
|
||||
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
|
||||
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
|
||||
2, /* noti */ \
|
||||
/* Positive single-char repeats ** These are */ \
|
||||
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
|
||||
4, 4, 4, /* upto, minupto, exact ** mode */ \
|
||||
2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
|
||||
2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \
|
||||
4, 4, 4, /* upto I, minupto I, exact I */ \
|
||||
2, 2, 2, 4, /* *+I, ++I, ?+I, upto+I */ \
|
||||
/* Negative single-char repeats - only for chars < 256 */ \
|
||||
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
|
||||
4, 4, 4, /* NOT upto, minupto, exact */ \
|
||||
2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
|
||||
2, 2, 2, 4, /* Possessive NOT *, +, ?, upto */ \
|
||||
2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \
|
||||
4, 4, 4, /* NOT upto I, minupto I, exact I */ \
|
||||
2, 2, 2, 4, /* Possessive NOT *I, +I, ?I, upto I */ \
|
||||
/* Positive type repeats */ \
|
||||
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
|
||||
4, 4, 4, /* Type upto, minupto, exact */ \
|
||||
@@ -1528,33 +1614,40 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
33, /* NCLASS */ \
|
||||
0, /* XCLASS - variable length */ \
|
||||
3, /* REF */ \
|
||||
3, /* REFI */ \
|
||||
1+LINK_SIZE, /* RECURSE */ \
|
||||
2+2*LINK_SIZE, /* CALLOUT */ \
|
||||
1+LINK_SIZE, /* Alt */ \
|
||||
1+LINK_SIZE, /* Ket */ \
|
||||
1+LINK_SIZE, /* KetRmax */ \
|
||||
1+LINK_SIZE, /* KetRmin */ \
|
||||
1+LINK_SIZE, /* KetRpos */ \
|
||||
1+LINK_SIZE, /* Reverse */ \
|
||||
1+LINK_SIZE, /* Assert */ \
|
||||
1+LINK_SIZE, /* Assert not */ \
|
||||
1+LINK_SIZE, /* Assert behind */ \
|
||||
1+LINK_SIZE, /* Assert behind not */ \
|
||||
1+LINK_SIZE, /* Reverse */ \
|
||||
1+LINK_SIZE, /* ONCE */ \
|
||||
1+LINK_SIZE, /* ONCE_NC */ \
|
||||
1+LINK_SIZE, /* BRA */ \
|
||||
1+LINK_SIZE, /* BRAPOS */ \
|
||||
3+LINK_SIZE, /* CBRA */ \
|
||||
3+LINK_SIZE, /* CBRAPOS */ \
|
||||
1+LINK_SIZE, /* COND */ \
|
||||
1+LINK_SIZE, /* SBRA */ \
|
||||
1+LINK_SIZE, /* SBRAPOS */ \
|
||||
3+LINK_SIZE, /* SCBRA */ \
|
||||
3+LINK_SIZE, /* SCBRAPOS */ \
|
||||
1+LINK_SIZE, /* SCOND */ \
|
||||
3, 3, /* CREF, NCREF */ \
|
||||
3, 3, /* RREF, NRREF */ \
|
||||
1, /* DEF */ \
|
||||
1, 1, /* BRAZERO, BRAMINZERO */ \
|
||||
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \
|
||||
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
|
||||
1, 3, /* SKIP, SKIP_ARG */ \
|
||||
1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
|
||||
1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
|
||||
1, 3, /* THEN, THEN_ARG */ \
|
||||
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \
|
||||
3, 1 /* CLOSE, SKIPZERO */
|
||||
|
||||
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
|
||||
condition. */
|
||||
@@ -1571,8 +1664,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
|
||||
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
|
||||
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
|
||||
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
|
||||
ERRCOUNT };
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
|
||||
ERR70, ERR71, ERR72, ERRCOUNT };
|
||||
|
||||
/* The real format of the start of the pcre block; the index of names and the
|
||||
code vector run on as long as necessary after the end. We store an explicit
|
||||
@@ -1648,10 +1741,12 @@ typedef struct compile_data {
|
||||
uschar *name_table; /* The name/number table */
|
||||
int names_found; /* Number of entries so far */
|
||||
int name_entry_size; /* Size of each entry */
|
||||
int workspace_size; /* Size of workspace */
|
||||
int bracount; /* Count of capturing parens as we compile */
|
||||
int final_bracount; /* Saved value after first pass */
|
||||
int top_backref; /* Maximum back reference */
|
||||
unsigned int backref_map; /* Bitmap of low back refs */
|
||||
int assert_depth; /* Depth of nested assertions */
|
||||
int external_options; /* External (initial) options */
|
||||
int external_flags; /* External flag bits to be set */
|
||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
@@ -1663,7 +1758,7 @@ typedef struct compile_data {
|
||||
} compile_data;
|
||||
|
||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||
branches, for testing for left recursion. */
|
||||
branches, for testing for left recursion while compiling. */
|
||||
|
||||
typedef struct branch_chain {
|
||||
struct branch_chain *outer;
|
||||
@@ -1671,20 +1766,28 @@ typedef struct branch_chain {
|
||||
} branch_chain;
|
||||
|
||||
/* Structure for items in a linked list that represents an explicit recursive
|
||||
call within the pattern. */
|
||||
call within the pattern; used by pcre_exec(). */
|
||||
|
||||
typedef struct recursion_info {
|
||||
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
|
||||
int group_num; /* Number of group that was called */
|
||||
const uschar *after_call; /* "Return value": points after the call in the expr */
|
||||
int *offset_save; /* Pointer to start of saved offsets */
|
||||
int saved_max; /* Number of saved offsets */
|
||||
int save_offset_top; /* Current value of offset_top */
|
||||
int group_num; /* Number of group that was called */
|
||||
int *offset_save; /* Pointer to start of saved offsets */
|
||||
int saved_max; /* Number of saved offsets */
|
||||
USPTR subject_position; /* Position at start of recursion */
|
||||
} recursion_info;
|
||||
|
||||
/* A similar structure for pcre_dfa_exec(). */
|
||||
|
||||
typedef struct dfa_recursion_info {
|
||||
struct dfa_recursion_info *prevrec;
|
||||
int group_num;
|
||||
USPTR subject_position;
|
||||
} dfa_recursion_info;
|
||||
|
||||
/* Structure for building a chain of data for holding the values of the subject
|
||||
pointer at the start of each subpattern, so as to detect when an empty string
|
||||
has been matched by a subpattern - to break infinite loops. */
|
||||
has been matched by a subpattern - to break infinite loops; used by
|
||||
pcre_exec(). */
|
||||
|
||||
typedef struct eptrblock {
|
||||
struct eptrblock *epb_prev;
|
||||
@@ -1708,8 +1811,8 @@ typedef struct match_data {
|
||||
int name_entry_size; /* Size of entry in names table */
|
||||
uschar *name_table; /* Table of names */
|
||||
uschar nl[4]; /* Newline string when fixed */
|
||||
const uschar *lcc; /* Points to lower casing table */
|
||||
const uschar *ctypes; /* Points to table of type maps */
|
||||
const uschar *lcc; /* Points to lower casing table */
|
||||
const uschar *ctypes; /* Points to table of type maps */
|
||||
BOOL offset_overflow; /* Set if too many extractions */
|
||||
BOOL notbol; /* NOTBOL flag */
|
||||
BOOL noteol; /* NOTEOL flag */
|
||||
@@ -1721,7 +1824,9 @@ typedef struct match_data {
|
||||
BOOL notempty_atstart; /* Empty string match at start not wanted */
|
||||
BOOL hitend; /* Hit the end of the subject at some point */
|
||||
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
|
||||
const uschar *start_code; /* For use when recursing */
|
||||
BOOL hasthen; /* Pattern contains (*THEN) */
|
||||
BOOL ignore_skip_arg; /* For re-run when SKIP name not found */
|
||||
const uschar *start_code; /* For use when recursing */
|
||||
USPTR start_subject; /* Start of the subject string */
|
||||
USPTR end_subject; /* End of the subject string */
|
||||
USPTR start_match_ptr; /* Start of matched string */
|
||||
@@ -1731,29 +1836,33 @@ typedef struct match_data {
|
||||
int end_offset_top; /* Highwater mark at end of match */
|
||||
int capture_last; /* Most recent capture number */
|
||||
int start_offset; /* The start offset value */
|
||||
int match_function_type; /* Set for certain special calls of MATCH() */
|
||||
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
|
||||
int eptrn; /* Next free eptrblock */
|
||||
recursion_info *recursive; /* Linked list of recursion data */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
const uschar *mark; /* Mark pointer to pass back */
|
||||
const uschar *mark; /* Mark pointer to pass back on success */
|
||||
const uschar *nomatch_mark; /* Mark pointer to pass back on failure */
|
||||
const uschar *once_target; /* Where to back up to for atomic groups */
|
||||
} match_data;
|
||||
|
||||
/* A similar structure is used for the same purpose by the DFA matching
|
||||
functions. */
|
||||
|
||||
typedef struct dfa_match_data {
|
||||
const uschar *start_code; /* Start of the compiled pattern */
|
||||
const uschar *start_subject; /* Start of the subject string */
|
||||
const uschar *end_subject; /* End of subject string */
|
||||
const uschar *start_used_ptr; /* Earliest consulted character */
|
||||
const uschar *tables; /* Character tables */
|
||||
int start_offset; /* The start offset value */
|
||||
int moptions; /* Match options */
|
||||
int poptions; /* Pattern options */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
const uschar *start_code; /* Start of the compiled pattern */
|
||||
const uschar *start_subject; /* Start of the subject string */
|
||||
const uschar *end_subject; /* End of subject string */
|
||||
const uschar *start_used_ptr; /* Earliest consulted character */
|
||||
const uschar *tables; /* Character tables */
|
||||
int start_offset; /* The start offset value */
|
||||
int moptions; /* Match options */
|
||||
int poptions; /* Pattern options */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
dfa_recursion_info *recursive; /* Linked list of recursion data */
|
||||
} dfa_match_data;
|
||||
|
||||
/* Bit definitions for entries in the pcre_ctypes table. */
|
||||
@@ -1811,6 +1920,10 @@ extern const int _pcre_utf8_table2[];
|
||||
extern const int _pcre_utf8_table3[];
|
||||
extern const uschar _pcre_utf8_table4[];
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
extern const uschar _pcre_utf8_char_sizes[];
|
||||
#endif
|
||||
|
||||
extern const int _pcre_utf8_table1_size;
|
||||
|
||||
extern const char _pcre_utt_names[];
|
||||
@@ -1831,10 +1944,17 @@ extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
extern int _pcre_valid_utf8(USPTR, int);
|
||||
extern int _pcre_valid_utf8(USPTR, int, int *);
|
||||
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
extern void _pcre_jit_compile(const real_pcre *, pcre_extra *);
|
||||
extern int _pcre_jit_exec(const real_pcre *, void *, PCRE_SPTR,
|
||||
int, int, int, int, int *, int);
|
||||
extern void _pcre_jit_free(void *);
|
||||
extern int _pcre_jit_get_size(void *);
|
||||
#endif
|
||||
|
||||
/* Unicode character database (UCD) */
|
||||
|
||||
@@ -1848,14 +1968,16 @@ extern const ucd_record _pcre_ucd_records[];
|
||||
extern const uschar _pcre_ucd_stage1[];
|
||||
extern const pcre_uint16 _pcre_ucd_stage2[];
|
||||
extern const int _pcre_ucp_gentype[];
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
extern const int _pcre_ucp_typerange[];
|
||||
#endif
|
||||
|
||||
/* UCD access macros */
|
||||
|
||||
#define UCD_BLOCK_SIZE 128
|
||||
#define GET_UCD(ch) (_pcre_ucd_records + \
|
||||
_pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
|
||||
UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE])
|
||||
UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])
|
||||
|
||||
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
|
||||
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
|
||||
|
||||
6605
harbour/src/3rd/pcre/pcrejitc.c
Normal file
6605
harbour/src/3rd/pcre/pcrejitc.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -180,6 +180,7 @@ utf8 = (options & PCRE_UTF8) != 0;
|
||||
for(;;)
|
||||
{
|
||||
uschar *ccode;
|
||||
const char *flag = " ";
|
||||
int c;
|
||||
int extra = 0;
|
||||
|
||||
@@ -214,10 +215,6 @@ for(;;)
|
||||
fprintf(f, "------------------------------------------------------------------\n");
|
||||
return;
|
||||
|
||||
case OP_OPT:
|
||||
fprintf(f, " %.2x %s", code[1], OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_CHAR:
|
||||
fprintf(f, " ");
|
||||
do
|
||||
@@ -229,28 +226,33 @@ for(;;)
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CHARNC:
|
||||
fprintf(f, " NC ");
|
||||
case OP_CHARI:
|
||||
fprintf(f, " /i ");
|
||||
do
|
||||
{
|
||||
code++;
|
||||
code += 1 + print_char(f, code, utf8);
|
||||
}
|
||||
while (*code == OP_CHARNC);
|
||||
while (*code == OP_CHARI);
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRA:
|
||||
case OP_SCBRAPOS:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_BRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRA:
|
||||
case OP_SBRAPOS:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_KETRPOS:
|
||||
case OP_ALT:
|
||||
case OP_KET:
|
||||
case OP_ASSERT:
|
||||
@@ -258,6 +260,7 @@ for(;;)
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
case OP_REVERSE:
|
||||
@@ -295,6 +298,17 @@ for(;;)
|
||||
fprintf(f, " Cond def");
|
||||
break;
|
||||
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_POSSTARI:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_POSPLUSI:
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_POSQUERYI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_POSSTAR:
|
||||
@@ -313,7 +327,7 @@ for(;;)
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSQUERY:
|
||||
fprintf(f, " ");
|
||||
fprintf(f, " %s ", flag);
|
||||
if (*code >= OP_TYPESTAR)
|
||||
{
|
||||
fprintf(f, "%s", OP_names[code[1]]);
|
||||
@@ -327,17 +341,23 @@ for(;;)
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_EXACTI:
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_POSUPTOI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_EXACT:
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
fprintf(f, " ");
|
||||
fprintf(f, " %s ", flag);
|
||||
extra = print_char(f, code+3, utf8);
|
||||
fprintf(f, "{");
|
||||
if (*code != OP_EXACT) fprintf(f, "0,");
|
||||
if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_MINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_POSUPTO) fprintf(f, "+");
|
||||
if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
|
||||
else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_TYPEEXACT:
|
||||
@@ -357,12 +377,27 @@ for(;;)
|
||||
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_NOTI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_NOT:
|
||||
c = code[1];
|
||||
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
|
||||
else fprintf(f, " [^\\x%02x]", c);
|
||||
if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
|
||||
else fprintf(f, " %s [^\\x%02x]", flag, c);
|
||||
break;
|
||||
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTPOSQUERYI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTPOSSTAR:
|
||||
@@ -373,22 +408,30 @@ for(;;)
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTPOSQUERY:
|
||||
c = code[1];
|
||||
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
|
||||
else fprintf(f, " [^\\x%02x]", c);
|
||||
if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
|
||||
else fprintf(f, " %s [^\\x%02x]", flag, c);
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_NOTEXACTI:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTPOSUPTOI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTPOSUPTO:
|
||||
c = code[3];
|
||||
if (PRINTABLE(c)) fprintf(f, " [^%c]{", c);
|
||||
else fprintf(f, " [^\\x%02x]{", c);
|
||||
if (*code != OP_NOTEXACT) fprintf(f, "0,");
|
||||
if (PRINTABLE(c)) fprintf(f, " %s [^%c]{", flag, c);
|
||||
else fprintf(f, " %s [^\\x%02x]{", flag, c);
|
||||
if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_NOTMINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
|
||||
if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
|
||||
else
|
||||
if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_RECURSE:
|
||||
@@ -397,8 +440,11 @@ for(;;)
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_REFI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_REF:
|
||||
fprintf(f, " \\%d", GET2(code,1));
|
||||
fprintf(f, " %s \\%d", flag, GET2(code,1));
|
||||
ccode = code + _pcre_OP_lengths[*code];
|
||||
goto CLASS_REF_REPEAT;
|
||||
|
||||
@@ -542,25 +588,23 @@ for(;;)
|
||||
break;
|
||||
|
||||
case OP_THEN:
|
||||
if (print_lengths)
|
||||
fprintf(f, " %s %d", OP_names[*code], GET(code, 1));
|
||||
else
|
||||
fprintf(f, " %s", OP_names[*code]);
|
||||
fprintf(f, " %s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_THEN_ARG:
|
||||
if (print_lengths)
|
||||
fprintf(f, " %s %d %s", OP_names[*code], GET(code, 1),
|
||||
code + 2 + LINK_SIZE);
|
||||
else
|
||||
fprintf(f, " %s %s", OP_names[*code], code + 2 + LINK_SIZE);
|
||||
extra += code[1+LINK_SIZE];
|
||||
fprintf(f, " %s %s", OP_names[*code], code + 2);
|
||||
extra += code[1];
|
||||
break;
|
||||
|
||||
/* Anything else is just an item with no data*/
|
||||
case OP_CIRCM:
|
||||
case OP_DOLLM:
|
||||
flag = "/m";
|
||||
/* Fall through */
|
||||
|
||||
/* Anything else is just an item with no data, but possibly a flag. */
|
||||
|
||||
default:
|
||||
fprintf(f, " %s", OP_names[*code]);
|
||||
fprintf(f, " %s %s", flag, OP_names[*code]);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ supporting functions. */
|
||||
|
||||
/* Returns from set_start_bits() */
|
||||
|
||||
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
|
||||
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
|
||||
|
||||
|
||||
|
||||
@@ -66,17 +66,20 @@ string of that length that matches. In UTF8 mode, the result is in characters
|
||||
rather than bytes.
|
||||
|
||||
Arguments:
|
||||
code pointer to start of group (the bracket)
|
||||
startcode pointer to start of the whole pattern
|
||||
options the compiling options
|
||||
code pointer to start of group (the bracket)
|
||||
startcode pointer to start of the whole pattern
|
||||
options the compiling options
|
||||
int RECURSE depth
|
||||
|
||||
Returns: the minimum length
|
||||
-1 if \C was encountered
|
||||
-1 if \C in UTF-8 mode or (*ACCEPT) was encountered
|
||||
-2 internal error (missing capturing bracket)
|
||||
-3 internal error (opcode not listed)
|
||||
*/
|
||||
|
||||
static int
|
||||
find_minlength(const uschar *code, const uschar *startcode, int options)
|
||||
find_minlength(const uschar *code, const uschar *startcode, int options,
|
||||
int recurse_depth)
|
||||
{
|
||||
int length = -1;
|
||||
BOOL utf8 = (options & PCRE_UTF8) != 0;
|
||||
@@ -84,7 +87,8 @@ BOOL had_recurse = FALSE;
|
||||
register int branchlength = 0;
|
||||
register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
|
||||
|
||||
if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
|
||||
if (*code == OP_CBRA || *code == OP_SCBRA ||
|
||||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
|
||||
|
||||
/* Scan along the opcodes for this branch. If we get to the end of the
|
||||
branch, check the length against that of the other branches. */
|
||||
@@ -118,26 +122,40 @@ for (;;)
|
||||
case OP_SCBRA:
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
case OP_ONCE:
|
||||
d = find_minlength(cc, startcode, options);
|
||||
case OP_ONCE_NC:
|
||||
d = find_minlength(cc, startcode, options, recurse_depth);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* ACCEPT makes things far too complicated; we have to give up. */
|
||||
|
||||
case OP_ACCEPT:
|
||||
case OP_ASSERT_ACCEPT:
|
||||
return -1;
|
||||
|
||||
/* Reached end of a branch; if it's a ket it is the end of a nested
|
||||
call. If it's ALT it is an alternation in a nested call. If it is
|
||||
END it's the end of the outer call. All can be handled by the same code. */
|
||||
call. If it's ALT it is an alternation in a nested call. If it is END it's
|
||||
the end of the outer call. All can be handled by the same code. If an
|
||||
ACCEPT was previously encountered, use the length that was in force at that
|
||||
time, and pass back the shortest ACCEPT length. */
|
||||
|
||||
case OP_ALT:
|
||||
case OP_KET:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_KETRPOS:
|
||||
case OP_END:
|
||||
if (length < 0 || (!had_recurse && branchlength < length))
|
||||
length = branchlength;
|
||||
if (*cc != OP_ALT) return length;
|
||||
if (op != OP_ALT) return length;
|
||||
cc += 1 + LINK_SIZE;
|
||||
branchlength = 0;
|
||||
had_recurse = FALSE;
|
||||
@@ -160,14 +178,15 @@ for (;;)
|
||||
case OP_RREF:
|
||||
case OP_NRREF:
|
||||
case OP_DEF:
|
||||
case OP_OPT:
|
||||
case OP_CALLOUT:
|
||||
case OP_SOD:
|
||||
case OP_SOM:
|
||||
case OP_EOD:
|
||||
case OP_EODN:
|
||||
case OP_CIRC:
|
||||
case OP_CIRCM:
|
||||
case OP_DOLL:
|
||||
case OP_DOLLM:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_WORD_BOUNDARY:
|
||||
cc += _pcre_OP_lengths[*cc];
|
||||
@@ -177,6 +196,7 @@ for (;;)
|
||||
|
||||
case OP_BRAZERO:
|
||||
case OP_BRAMINZERO:
|
||||
case OP_BRAPOSZERO:
|
||||
case OP_SKIPZERO:
|
||||
cc += _pcre_OP_lengths[*cc];
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
@@ -186,14 +206,21 @@ for (;;)
|
||||
/* Handle literal characters and + repetitions */
|
||||
|
||||
case OP_CHAR:
|
||||
case OP_CHARNC:
|
||||
case OP_CHARI:
|
||||
case OP_NOT:
|
||||
case OP_NOTI:
|
||||
case OP_PLUS:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUS:
|
||||
case OP_MINPLUSI:
|
||||
case OP_POSPLUS:
|
||||
case OP_POSPLUSI:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTPOSPLUSI:
|
||||
branchlength++;
|
||||
cc += 2;
|
||||
#ifdef SUPPORT_UTF8
|
||||
@@ -212,7 +239,9 @@ for (;;)
|
||||
need to skip over a multibyte character in UTF8 mode. */
|
||||
|
||||
case OP_EXACT:
|
||||
case OP_EXACTI:
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTEXACTI:
|
||||
branchlength += GET2(cc,1);
|
||||
cc += 4;
|
||||
#ifdef SUPPORT_UTF8
|
||||
@@ -249,14 +278,17 @@ for (;;)
|
||||
cc++;
|
||||
break;
|
||||
|
||||
/* "Any newline" might match two characters */
|
||||
/* "Any newline" might match two characters, but it also might match just
|
||||
one. */
|
||||
|
||||
case OP_ANYNL:
|
||||
branchlength += 2;
|
||||
branchlength += 1;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
/* The single-byte matcher means we can't proceed in UTF-8 mode */
|
||||
/* The single-byte matcher means we can't proceed in UTF-8 mode. (In
|
||||
non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
|
||||
appear, but leave the code, just in case.) */
|
||||
|
||||
case OP_ANYBYTE:
|
||||
#ifdef SUPPORT_UTF8
|
||||
@@ -337,6 +369,7 @@ for (;;)
|
||||
that case we must set the minimum length to zero. */
|
||||
|
||||
case OP_REF:
|
||||
case OP_REFI:
|
||||
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
|
||||
{
|
||||
ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
|
||||
@@ -347,7 +380,10 @@ for (;;)
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
}
|
||||
else d = find_minlength(cs, startcode, options);
|
||||
else
|
||||
{
|
||||
d = find_minlength(cs, startcode, options, recurse_depth);
|
||||
}
|
||||
}
|
||||
else d = 0;
|
||||
cc += 3;
|
||||
@@ -364,6 +400,12 @@ for (;;)
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRPLUS:
|
||||
case OP_CRMINPLUS:
|
||||
min = 1;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
min = GET2(cc, 1);
|
||||
@@ -378,36 +420,68 @@ for (;;)
|
||||
branchlength += min * d;
|
||||
break;
|
||||
|
||||
/* We can easily detect direct recursion, but not mutual recursion. This is
|
||||
caught by a recursion depth count. */
|
||||
|
||||
case OP_RECURSE:
|
||||
cs = ce = (uschar *)startcode + GET(cc, 1);
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce)
|
||||
if ((cc > cs && cc < ce) || recurse_depth > 10)
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
branchlength += find_minlength(cs, startcode, options);
|
||||
{
|
||||
branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
|
||||
}
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Anything else does not or need not match a character. We can get the
|
||||
item's length from the table, but for those that can match zero occurrences
|
||||
of a character, we must take special action for UTF-8 characters. */
|
||||
of a character, we must take special action for UTF-8 characters. As it
|
||||
happens, the "NOT" versions of these opcodes are used at present only for
|
||||
ASCII characters, so they could be omitted from this list. However, in
|
||||
future that may change, so we include them here so as not to leave a
|
||||
gotcha for a future maintainer. */
|
||||
|
||||
case OP_UPTO:
|
||||
case OP_UPTOI:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_MINUPTO:
|
||||
case OP_MINUPTOI:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_POSUPTO:
|
||||
case OP_POSUPTOI:
|
||||
case OP_NOTPOSUPTO:
|
||||
case OP_NOTPOSUPTOI:
|
||||
|
||||
case OP_STAR:
|
||||
case OP_STARI:
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTSTARI:
|
||||
case OP_MINSTAR:
|
||||
case OP_MINSTARI:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_POSSTAR:
|
||||
case OP_POSSTARI:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPOSSTARI:
|
||||
|
||||
case OP_QUERY:
|
||||
case OP_QUERYI:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_MINQUERY:
|
||||
case OP_MINQUERYI:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_POSQUERY:
|
||||
case OP_POSQUERYI:
|
||||
case OP_NOTPOSQUERY:
|
||||
case OP_NOTPOSQUERYI:
|
||||
|
||||
cc += _pcre_OP_lengths[op];
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
|
||||
@@ -419,20 +493,27 @@ for (;;)
|
||||
case OP_MARK:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_THEN_ARG:
|
||||
cc += _pcre_OP_lengths[op] + cc[1];
|
||||
break;
|
||||
|
||||
case OP_THEN_ARG:
|
||||
cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
|
||||
break;
|
||||
/* The remaining opcodes are just skipped over. */
|
||||
|
||||
/* For the record, these are the opcodes that are matched by "default":
|
||||
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
|
||||
OP_THEN. */
|
||||
|
||||
default:
|
||||
case OP_CLOSE:
|
||||
case OP_COMMIT:
|
||||
case OP_FAIL:
|
||||
case OP_PRUNE:
|
||||
case OP_SET_SOM:
|
||||
case OP_SKIP:
|
||||
case OP_THEN:
|
||||
cc += _pcre_OP_lengths[op];
|
||||
break;
|
||||
|
||||
/* This should not occur: we list all opcodes explicitly so that when
|
||||
new ones get added they are properly considered. */
|
||||
|
||||
default:
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
/* Control never gets here */
|
||||
@@ -578,18 +659,18 @@ function fails unless the result is SSB_DONE.
|
||||
Arguments:
|
||||
code points to an expression
|
||||
start_bits points to a 32-byte table, initialized to 0
|
||||
caseless the current state of the caseless flag
|
||||
utf8 TRUE if in UTF-8 mode
|
||||
cd the block with char table pointers
|
||||
|
||||
Returns: SSB_FAIL => Failed to find any starting bytes
|
||||
SSB_DONE => Found mandatory starting bytes
|
||||
SSB_CONTINUE => Found optional starting bytes
|
||||
SSB_UNKNOWN => Hit an unrecognized opcode
|
||||
*/
|
||||
|
||||
static int
|
||||
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
|
||||
BOOL utf8, compile_data *cd)
|
||||
set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
|
||||
compile_data *cd)
|
||||
{
|
||||
register int c;
|
||||
int yield = SSB_DONE;
|
||||
@@ -614,19 +695,106 @@ volatile int dummy;
|
||||
|
||||
do
|
||||
{
|
||||
const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
|
||||
BOOL try_next = TRUE;
|
||||
const uschar *tcode = code + 1 + LINK_SIZE;
|
||||
|
||||
if (*code == OP_CBRA || *code == OP_SCBRA ||
|
||||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
|
||||
|
||||
while (try_next) /* Loop for items in this branch */
|
||||
{
|
||||
int rc;
|
||||
|
||||
switch(*tcode)
|
||||
{
|
||||
/* Fail if we reach something we don't understand */
|
||||
/* If we reach something we don't understand, it means a new opcode has
|
||||
been created that hasn't been added to this code. Hopefully this problem
|
||||
will be discovered during testing. */
|
||||
|
||||
default:
|
||||
return SSB_UNKNOWN;
|
||||
|
||||
/* Fail for a valid opcode that implies no starting bits. */
|
||||
|
||||
case OP_ACCEPT:
|
||||
case OP_ASSERT_ACCEPT:
|
||||
case OP_ALLANY:
|
||||
case OP_ANY:
|
||||
case OP_ANYBYTE:
|
||||
case OP_CIRC:
|
||||
case OP_CIRCM:
|
||||
case OP_CLOSE:
|
||||
case OP_COMMIT:
|
||||
case OP_COND:
|
||||
case OP_CREF:
|
||||
case OP_DEF:
|
||||
case OP_DOLL:
|
||||
case OP_DOLLM:
|
||||
case OP_END:
|
||||
case OP_EOD:
|
||||
case OP_EODN:
|
||||
case OP_EXTUNI:
|
||||
case OP_FAIL:
|
||||
case OP_MARK:
|
||||
case OP_NCREF:
|
||||
case OP_NOT:
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTEXACTI:
|
||||
case OP_NOTI:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_NOTPOSQUERY:
|
||||
case OP_NOTPOSQUERYI:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_NOTPOSUPTO:
|
||||
case OP_NOTPOSUPTOI:
|
||||
case OP_NOTPROP:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOT_HSPACE:
|
||||
case OP_NOT_VSPACE:
|
||||
case OP_NRREF:
|
||||
case OP_PROP:
|
||||
case OP_PRUNE:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_RECURSE:
|
||||
case OP_REF:
|
||||
case OP_REFI:
|
||||
case OP_REVERSE:
|
||||
case OP_RREF:
|
||||
case OP_SCOND:
|
||||
case OP_SET_SOM:
|
||||
case OP_SKIP:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_SOD:
|
||||
case OP_SOM:
|
||||
case OP_THEN:
|
||||
case OP_THEN_ARG:
|
||||
case OP_XCLASS:
|
||||
return SSB_FAIL;
|
||||
|
||||
/* We can ignore word boundary tests. */
|
||||
|
||||
case OP_WORD_BOUNDARY:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
tcode++;
|
||||
break;
|
||||
|
||||
/* If we hit a bracket or a positive lookahead assertion, recurse to set
|
||||
bits from within the subpattern. If it can't find anything, we have to
|
||||
give up. If it finds some mandatory character(s), we are done for this
|
||||
@@ -636,10 +804,15 @@ do
|
||||
case OP_SBRA:
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
case OP_ASSERT:
|
||||
rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
|
||||
if (rc == SSB_FAIL) return SSB_FAIL;
|
||||
rc = set_start_bits(tcode, start_bits, utf8, cd);
|
||||
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
|
||||
if (rc == SSB_DONE) try_next = FALSE; else
|
||||
{
|
||||
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
|
||||
@@ -662,6 +835,7 @@ do
|
||||
case OP_KET:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_KETRPOS:
|
||||
return SSB_CONTINUE;
|
||||
|
||||
/* Skip over callout */
|
||||
@@ -679,19 +853,13 @@ do
|
||||
tcode += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Skip over an option setting, changing the caseless flag */
|
||||
|
||||
case OP_OPT:
|
||||
caseless = (tcode[1] & PCRE_CASELESS) != 0;
|
||||
tcode += 2;
|
||||
break;
|
||||
|
||||
/* BRAZERO does the bracket, but carries on. */
|
||||
|
||||
case OP_BRAZERO:
|
||||
case OP_BRAMINZERO:
|
||||
if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
|
||||
return SSB_FAIL;
|
||||
case OP_BRAPOSZERO:
|
||||
rc = set_start_bits(++tcode, start_bits, utf8, cd);
|
||||
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
|
||||
/* =========================================================================
|
||||
See the comment at the head of this function concerning the next line,
|
||||
which was an old fudge for the benefit of OS/2.
|
||||
@@ -717,7 +885,16 @@ do
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
|
||||
tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
|
||||
break;
|
||||
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_POSSTARI:
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_POSQUERYI:
|
||||
tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
|
||||
break;
|
||||
|
||||
/* Single-char upto sets the bit and tries the next */
|
||||
@@ -725,20 +902,36 @@ do
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
|
||||
tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
|
||||
break;
|
||||
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_POSUPTOI:
|
||||
tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
|
||||
break;
|
||||
|
||||
/* At least one single char sets the bit and stops */
|
||||
|
||||
case OP_EXACT: /* Fall through */
|
||||
case OP_EXACT:
|
||||
tcode += 2;
|
||||
|
||||
/* Fall through */
|
||||
case OP_CHAR:
|
||||
case OP_CHARNC:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
(void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
|
||||
(void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
case OP_EXACTI:
|
||||
tcode += 2;
|
||||
/* Fall through */
|
||||
case OP_CHARI:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_POSPLUSI:
|
||||
(void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
@@ -968,7 +1161,8 @@ do
|
||||
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
|
||||
}
|
||||
|
||||
/* Advance past the bit map, and act on what follows */
|
||||
/* Advance past the bit map, and act on what follows. For a zero
|
||||
minimum repeat, continue; otherwise stop processing. */
|
||||
|
||||
tcode += 32;
|
||||
switch (*tcode)
|
||||
@@ -1004,6 +1198,8 @@ return yield;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Study a compiled expression *
|
||||
*************************************************/
|
||||
@@ -1029,7 +1225,7 @@ pcre_study(const pcre *external_re, int options, const char **errorptr)
|
||||
int min;
|
||||
BOOL bits_set = FALSE;
|
||||
uschar start_bits[32];
|
||||
pcre_extra *extra;
|
||||
pcre_extra *extra = NULL;
|
||||
pcre_study_data *study;
|
||||
const uschar *tables;
|
||||
uschar *code;
|
||||
@@ -1060,6 +1256,8 @@ seeking a list of starting bytes. */
|
||||
if ((re->options & PCRE_ANCHORED) == 0 &&
|
||||
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* Set the character tables in the block that is passed around */
|
||||
|
||||
tables = re->tables;
|
||||
@@ -1075,55 +1273,116 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
|
||||
/* See if we can find a fixed set of initial characters for the pattern. */
|
||||
|
||||
memset(start_bits, 0, 32 * sizeof(uschar));
|
||||
bits_set = set_start_bits(code, start_bits,
|
||||
(re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
|
||||
&compile_block) == SSB_DONE;
|
||||
rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
|
||||
&compile_block);
|
||||
bits_set = rc == SSB_DONE;
|
||||
if (rc == SSB_UNKNOWN)
|
||||
{
|
||||
*errorptr = "internal error: opcode not recognized";
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Find the minimum length of subject string. */
|
||||
|
||||
min = find_minlength(code, code, re->options);
|
||||
|
||||
/* Return NULL if no optimization is possible. */
|
||||
|
||||
if (!bits_set && min < 0) return NULL;
|
||||
|
||||
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
|
||||
the latter, which is pointed to by the former, which may also get additional
|
||||
data set later by the calling program. At the moment, the size of
|
||||
pcre_study_data is fixed. We nevertheless save it in a field for returning via
|
||||
the pcre_fullinfo() function so that if it becomes variable in the future, we
|
||||
don't have to change that code. */
|
||||
|
||||
extra = (pcre_extra *)(pcre_malloc)
|
||||
(sizeof(pcre_extra) + sizeof(pcre_study_data));
|
||||
|
||||
if (extra == NULL)
|
||||
switch(min = find_minlength(code, code, re->options, 0))
|
||||
{
|
||||
*errorptr = "failed to get memory";
|
||||
return NULL;
|
||||
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
|
||||
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
|
||||
default: break;
|
||||
}
|
||||
|
||||
study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
|
||||
extra->flags = PCRE_EXTRA_STUDY_DATA;
|
||||
extra->study_data = study;
|
||||
/* If a set of starting bytes has been identified, or if the minimum length is
|
||||
greater than zero, or if JIT optimization has been requested, get a pcre_extra
|
||||
block and a pcre_study_data block. The study data is put in the latter, which
|
||||
is pointed to by the former, which may also get additional data set later by
|
||||
the calling program. At the moment, the size of pcre_study_data is fixed. We
|
||||
nevertheless save it in a field for returning via the pcre_fullinfo() function
|
||||
so that if it becomes variable in the future, we don't have to change that
|
||||
code. */
|
||||
|
||||
study->size = sizeof(pcre_study_data);
|
||||
study->flags = 0;
|
||||
|
||||
if (bits_set)
|
||||
if (bits_set || min > 0
|
||||
#ifdef SUPPORT_JIT
|
||||
|| (options & PCRE_STUDY_JIT_COMPILE) != 0
|
||||
#endif
|
||||
)
|
||||
{
|
||||
study->flags |= PCRE_STUDY_MAPPED;
|
||||
memcpy(study->start_bits, start_bits, sizeof(start_bits));
|
||||
}
|
||||
extra = (pcre_extra *)(pcre_malloc)
|
||||
(sizeof(pcre_extra) + sizeof(pcre_study_data));
|
||||
if (extra == NULL)
|
||||
{
|
||||
*errorptr = "failed to get memory";
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (min >= 0)
|
||||
{
|
||||
study->flags |= PCRE_STUDY_MINLEN;
|
||||
study->minlength = min;
|
||||
study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
|
||||
extra->flags = PCRE_EXTRA_STUDY_DATA;
|
||||
extra->study_data = study;
|
||||
|
||||
study->size = sizeof(pcre_study_data);
|
||||
study->flags = 0;
|
||||
|
||||
/* Set the start bits always, to avoid unset memory errors if the
|
||||
study data is written to a file, but set the flag only if any of the bits
|
||||
are set, to save time looking when none are. */
|
||||
|
||||
if (bits_set)
|
||||
{
|
||||
study->flags |= PCRE_STUDY_MAPPED;
|
||||
memcpy(study->start_bits, start_bits, sizeof(start_bits));
|
||||
}
|
||||
else memset(study->start_bits, 0, 32 * sizeof(uschar));
|
||||
|
||||
/* Always set the minlength value in the block, because the JIT compiler
|
||||
makes use of it. However, don't set the bit unless the length is greater than
|
||||
zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
|
||||
checking the zero case. */
|
||||
|
||||
if (min > 0)
|
||||
{
|
||||
study->flags |= PCRE_STUDY_MINLEN;
|
||||
study->minlength = min;
|
||||
}
|
||||
else study->minlength = 0;
|
||||
|
||||
/* If JIT support was compiled and requested, attempt the JIT compilation.
|
||||
If no starting bytes were found, and the minimum length is zero, and JIT
|
||||
compilation fails, abandon the extra block and return NULL. */
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
extra->executable_jit = NULL;
|
||||
if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
|
||||
if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
|
||||
{
|
||||
pcre_free_study(extra);
|
||||
extra = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
return extra;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free the study data *
|
||||
*************************************************/
|
||||
|
||||
/* This function frees the memory that was obtained by pcre_study().
|
||||
|
||||
Argument: a pointer to the pcre_extra block
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE_EXP_DEFN void
|
||||
pcre_free_study(pcre_extra *extra)
|
||||
{
|
||||
#ifdef SUPPORT_JIT
|
||||
if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
|
||||
extra->executable_jit != NULL)
|
||||
_pcre_jit_free(extra->executable_jit);
|
||||
#endif
|
||||
pcre_free(extra);
|
||||
}
|
||||
|
||||
/* End of pcre_study.c */
|
||||
|
||||
@@ -87,6 +87,19 @@ const uschar _pcre_utf8_table4[] = {
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
/* Full table of the number of extra bytes when the
|
||||
character code is greater than or equal to 0xc0.
|
||||
See _pcre_utf8_table4 above. */
|
||||
|
||||
const uschar _pcre_utf8_char_sizes[] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,
|
||||
};
|
||||
#endif
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
const int _pcre_ucp_gentype[] = {
|
||||
@@ -100,6 +113,21 @@ const int _pcre_ucp_gentype[] = {
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
/* This table reverses _pcre_ucp_gentype. We can save the cost
|
||||
of a memory load. */
|
||||
|
||||
const int _pcre_ucp_typerange[] = {
|
||||
ucp_Cc, ucp_Cs,
|
||||
ucp_Ll, ucp_Lu,
|
||||
ucp_Mc, ucp_Mn,
|
||||
ucp_Nd, ucp_No,
|
||||
ucp_Pc, ucp_Ps,
|
||||
ucp_Sc, ucp_So,
|
||||
ucp_Zl, ucp_Zs,
|
||||
};
|
||||
#endif
|
||||
|
||||
/* The pcre_utt[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
@@ -110,7 +138,7 @@ table itself. Maintenance is more error-prone, but frequent changes to this
|
||||
data are unlikely.
|
||||
|
||||
July 2008: There is now a script called maint/GenerateUtt.py that can be used
|
||||
to generate this data instead of maintaining it entirely by hand.
|
||||
to generate this data automatically instead of maintaining it by hand.
|
||||
|
||||
The script was updated in March 2009 to generate a new EBCDIC-compliant
|
||||
version. Like all other character and string literals that are compared against
|
||||
@@ -123,8 +151,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
|
||||
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
|
||||
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
|
||||
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
|
||||
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
|
||||
#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
|
||||
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
|
||||
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
|
||||
@@ -186,6 +216,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
|
||||
#define STRING_M0 STR_M "\0"
|
||||
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
|
||||
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
|
||||
#define STRING_Mc0 STR_M STR_c "\0"
|
||||
#define STRING_Me0 STR_M STR_e "\0"
|
||||
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
|
||||
@@ -260,8 +291,10 @@ const char _pcre_utt_names[] =
|
||||
STRING_Avestan0
|
||||
STRING_Balinese0
|
||||
STRING_Bamum0
|
||||
STRING_Batak0
|
||||
STRING_Bengali0
|
||||
STRING_Bopomofo0
|
||||
STRING_Brahmi0
|
||||
STRING_Braille0
|
||||
STRING_Buginese0
|
||||
STRING_Buhid0
|
||||
@@ -323,6 +356,7 @@ const char _pcre_utt_names[] =
|
||||
STRING_Lydian0
|
||||
STRING_M0
|
||||
STRING_Malayalam0
|
||||
STRING_Mandaic0
|
||||
STRING_Mc0
|
||||
STRING_Me0
|
||||
STRING_Meetei_Mayek0
|
||||
@@ -397,135 +431,138 @@ const ucp_type_table _pcre_utt[] = {
|
||||
{ 20, PT_SC, ucp_Avestan },
|
||||
{ 28, PT_SC, ucp_Balinese },
|
||||
{ 37, PT_SC, ucp_Bamum },
|
||||
{ 43, PT_SC, ucp_Bengali },
|
||||
{ 51, PT_SC, ucp_Bopomofo },
|
||||
{ 60, PT_SC, ucp_Braille },
|
||||
{ 68, PT_SC, ucp_Buginese },
|
||||
{ 77, PT_SC, ucp_Buhid },
|
||||
{ 83, PT_GC, ucp_C },
|
||||
{ 85, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 105, PT_SC, ucp_Carian },
|
||||
{ 112, PT_PC, ucp_Cc },
|
||||
{ 115, PT_PC, ucp_Cf },
|
||||
{ 118, PT_SC, ucp_Cham },
|
||||
{ 123, PT_SC, ucp_Cherokee },
|
||||
{ 132, PT_PC, ucp_Cn },
|
||||
{ 135, PT_PC, ucp_Co },
|
||||
{ 138, PT_SC, ucp_Common },
|
||||
{ 145, PT_SC, ucp_Coptic },
|
||||
{ 152, PT_PC, ucp_Cs },
|
||||
{ 155, PT_SC, ucp_Cuneiform },
|
||||
{ 165, PT_SC, ucp_Cypriot },
|
||||
{ 173, PT_SC, ucp_Cyrillic },
|
||||
{ 182, PT_SC, ucp_Deseret },
|
||||
{ 190, PT_SC, ucp_Devanagari },
|
||||
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 222, PT_SC, ucp_Ethiopic },
|
||||
{ 231, PT_SC, ucp_Georgian },
|
||||
{ 240, PT_SC, ucp_Glagolitic },
|
||||
{ 251, PT_SC, ucp_Gothic },
|
||||
{ 258, PT_SC, ucp_Greek },
|
||||
{ 264, PT_SC, ucp_Gujarati },
|
||||
{ 273, PT_SC, ucp_Gurmukhi },
|
||||
{ 282, PT_SC, ucp_Han },
|
||||
{ 286, PT_SC, ucp_Hangul },
|
||||
{ 293, PT_SC, ucp_Hanunoo },
|
||||
{ 301, PT_SC, ucp_Hebrew },
|
||||
{ 308, PT_SC, ucp_Hiragana },
|
||||
{ 317, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 334, PT_SC, ucp_Inherited },
|
||||
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 366, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 389, PT_SC, ucp_Javanese },
|
||||
{ 398, PT_SC, ucp_Kaithi },
|
||||
{ 405, PT_SC, ucp_Kannada },
|
||||
{ 413, PT_SC, ucp_Katakana },
|
||||
{ 422, PT_SC, ucp_Kayah_Li },
|
||||
{ 431, PT_SC, ucp_Kharoshthi },
|
||||
{ 442, PT_SC, ucp_Khmer },
|
||||
{ 448, PT_GC, ucp_L },
|
||||
{ 450, PT_LAMP, 0 },
|
||||
{ 453, PT_SC, ucp_Lao },
|
||||
{ 457, PT_SC, ucp_Latin },
|
||||
{ 463, PT_SC, ucp_Lepcha },
|
||||
{ 470, PT_SC, ucp_Limbu },
|
||||
{ 476, PT_SC, ucp_Linear_B },
|
||||
{ 485, PT_SC, ucp_Lisu },
|
||||
{ 490, PT_PC, ucp_Ll },
|
||||
{ 493, PT_PC, ucp_Lm },
|
||||
{ 496, PT_PC, ucp_Lo },
|
||||
{ 499, PT_PC, ucp_Lt },
|
||||
{ 502, PT_PC, ucp_Lu },
|
||||
{ 505, PT_SC, ucp_Lycian },
|
||||
{ 512, PT_SC, ucp_Lydian },
|
||||
{ 519, PT_GC, ucp_M },
|
||||
{ 521, PT_SC, ucp_Malayalam },
|
||||
{ 531, PT_PC, ucp_Mc },
|
||||
{ 534, PT_PC, ucp_Me },
|
||||
{ 537, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 550, PT_PC, ucp_Mn },
|
||||
{ 553, PT_SC, ucp_Mongolian },
|
||||
{ 563, PT_SC, ucp_Myanmar },
|
||||
{ 571, PT_GC, ucp_N },
|
||||
{ 573, PT_PC, ucp_Nd },
|
||||
{ 576, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 588, PT_SC, ucp_Nko },
|
||||
{ 592, PT_PC, ucp_Nl },
|
||||
{ 595, PT_PC, ucp_No },
|
||||
{ 598, PT_SC, ucp_Ogham },
|
||||
{ 604, PT_SC, ucp_Ol_Chiki },
|
||||
{ 613, PT_SC, ucp_Old_Italic },
|
||||
{ 624, PT_SC, ucp_Old_Persian },
|
||||
{ 636, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 654, PT_SC, ucp_Old_Turkic },
|
||||
{ 665, PT_SC, ucp_Oriya },
|
||||
{ 671, PT_SC, ucp_Osmanya },
|
||||
{ 679, PT_GC, ucp_P },
|
||||
{ 681, PT_PC, ucp_Pc },
|
||||
{ 684, PT_PC, ucp_Pd },
|
||||
{ 687, PT_PC, ucp_Pe },
|
||||
{ 690, PT_PC, ucp_Pf },
|
||||
{ 693, PT_SC, ucp_Phags_Pa },
|
||||
{ 702, PT_SC, ucp_Phoenician },
|
||||
{ 713, PT_PC, ucp_Pi },
|
||||
{ 716, PT_PC, ucp_Po },
|
||||
{ 719, PT_PC, ucp_Ps },
|
||||
{ 722, PT_SC, ucp_Rejang },
|
||||
{ 729, PT_SC, ucp_Runic },
|
||||
{ 735, PT_GC, ucp_S },
|
||||
{ 737, PT_SC, ucp_Samaritan },
|
||||
{ 747, PT_SC, ucp_Saurashtra },
|
||||
{ 758, PT_PC, ucp_Sc },
|
||||
{ 761, PT_SC, ucp_Shavian },
|
||||
{ 769, PT_SC, ucp_Sinhala },
|
||||
{ 777, PT_PC, ucp_Sk },
|
||||
{ 780, PT_PC, ucp_Sm },
|
||||
{ 783, PT_PC, ucp_So },
|
||||
{ 786, PT_SC, ucp_Sundanese },
|
||||
{ 796, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 809, PT_SC, ucp_Syriac },
|
||||
{ 816, PT_SC, ucp_Tagalog },
|
||||
{ 824, PT_SC, ucp_Tagbanwa },
|
||||
{ 833, PT_SC, ucp_Tai_Le },
|
||||
{ 840, PT_SC, ucp_Tai_Tham },
|
||||
{ 849, PT_SC, ucp_Tai_Viet },
|
||||
{ 858, PT_SC, ucp_Tamil },
|
||||
{ 864, PT_SC, ucp_Telugu },
|
||||
{ 871, PT_SC, ucp_Thaana },
|
||||
{ 878, PT_SC, ucp_Thai },
|
||||
{ 883, PT_SC, ucp_Tibetan },
|
||||
{ 891, PT_SC, ucp_Tifinagh },
|
||||
{ 900, PT_SC, ucp_Ugaritic },
|
||||
{ 909, PT_SC, ucp_Vai },
|
||||
{ 913, PT_ALNUM, 0 },
|
||||
{ 917, PT_PXSPACE, 0 },
|
||||
{ 921, PT_SPACE, 0 },
|
||||
{ 925, PT_WORD, 0 },
|
||||
{ 929, PT_SC, ucp_Yi },
|
||||
{ 932, PT_GC, ucp_Z },
|
||||
{ 934, PT_PC, ucp_Zl },
|
||||
{ 937, PT_PC, ucp_Zp },
|
||||
{ 940, PT_PC, ucp_Zs }
|
||||
{ 43, PT_SC, ucp_Batak },
|
||||
{ 49, PT_SC, ucp_Bengali },
|
||||
{ 57, PT_SC, ucp_Bopomofo },
|
||||
{ 66, PT_SC, ucp_Brahmi },
|
||||
{ 73, PT_SC, ucp_Braille },
|
||||
{ 81, PT_SC, ucp_Buginese },
|
||||
{ 90, PT_SC, ucp_Buhid },
|
||||
{ 96, PT_GC, ucp_C },
|
||||
{ 98, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 118, PT_SC, ucp_Carian },
|
||||
{ 125, PT_PC, ucp_Cc },
|
||||
{ 128, PT_PC, ucp_Cf },
|
||||
{ 131, PT_SC, ucp_Cham },
|
||||
{ 136, PT_SC, ucp_Cherokee },
|
||||
{ 145, PT_PC, ucp_Cn },
|
||||
{ 148, PT_PC, ucp_Co },
|
||||
{ 151, PT_SC, ucp_Common },
|
||||
{ 158, PT_SC, ucp_Coptic },
|
||||
{ 165, PT_PC, ucp_Cs },
|
||||
{ 168, PT_SC, ucp_Cuneiform },
|
||||
{ 178, PT_SC, ucp_Cypriot },
|
||||
{ 186, PT_SC, ucp_Cyrillic },
|
||||
{ 195, PT_SC, ucp_Deseret },
|
||||
{ 203, PT_SC, ucp_Devanagari },
|
||||
{ 214, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 235, PT_SC, ucp_Ethiopic },
|
||||
{ 244, PT_SC, ucp_Georgian },
|
||||
{ 253, PT_SC, ucp_Glagolitic },
|
||||
{ 264, PT_SC, ucp_Gothic },
|
||||
{ 271, PT_SC, ucp_Greek },
|
||||
{ 277, PT_SC, ucp_Gujarati },
|
||||
{ 286, PT_SC, ucp_Gurmukhi },
|
||||
{ 295, PT_SC, ucp_Han },
|
||||
{ 299, PT_SC, ucp_Hangul },
|
||||
{ 306, PT_SC, ucp_Hanunoo },
|
||||
{ 314, PT_SC, ucp_Hebrew },
|
||||
{ 321, PT_SC, ucp_Hiragana },
|
||||
{ 330, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 347, PT_SC, ucp_Inherited },
|
||||
{ 357, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 379, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 402, PT_SC, ucp_Javanese },
|
||||
{ 411, PT_SC, ucp_Kaithi },
|
||||
{ 418, PT_SC, ucp_Kannada },
|
||||
{ 426, PT_SC, ucp_Katakana },
|
||||
{ 435, PT_SC, ucp_Kayah_Li },
|
||||
{ 444, PT_SC, ucp_Kharoshthi },
|
||||
{ 455, PT_SC, ucp_Khmer },
|
||||
{ 461, PT_GC, ucp_L },
|
||||
{ 463, PT_LAMP, 0 },
|
||||
{ 466, PT_SC, ucp_Lao },
|
||||
{ 470, PT_SC, ucp_Latin },
|
||||
{ 476, PT_SC, ucp_Lepcha },
|
||||
{ 483, PT_SC, ucp_Limbu },
|
||||
{ 489, PT_SC, ucp_Linear_B },
|
||||
{ 498, PT_SC, ucp_Lisu },
|
||||
{ 503, PT_PC, ucp_Ll },
|
||||
{ 506, PT_PC, ucp_Lm },
|
||||
{ 509, PT_PC, ucp_Lo },
|
||||
{ 512, PT_PC, ucp_Lt },
|
||||
{ 515, PT_PC, ucp_Lu },
|
||||
{ 518, PT_SC, ucp_Lycian },
|
||||
{ 525, PT_SC, ucp_Lydian },
|
||||
{ 532, PT_GC, ucp_M },
|
||||
{ 534, PT_SC, ucp_Malayalam },
|
||||
{ 544, PT_SC, ucp_Mandaic },
|
||||
{ 552, PT_PC, ucp_Mc },
|
||||
{ 555, PT_PC, ucp_Me },
|
||||
{ 558, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 571, PT_PC, ucp_Mn },
|
||||
{ 574, PT_SC, ucp_Mongolian },
|
||||
{ 584, PT_SC, ucp_Myanmar },
|
||||
{ 592, PT_GC, ucp_N },
|
||||
{ 594, PT_PC, ucp_Nd },
|
||||
{ 597, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 609, PT_SC, ucp_Nko },
|
||||
{ 613, PT_PC, ucp_Nl },
|
||||
{ 616, PT_PC, ucp_No },
|
||||
{ 619, PT_SC, ucp_Ogham },
|
||||
{ 625, PT_SC, ucp_Ol_Chiki },
|
||||
{ 634, PT_SC, ucp_Old_Italic },
|
||||
{ 645, PT_SC, ucp_Old_Persian },
|
||||
{ 657, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 675, PT_SC, ucp_Old_Turkic },
|
||||
{ 686, PT_SC, ucp_Oriya },
|
||||
{ 692, PT_SC, ucp_Osmanya },
|
||||
{ 700, PT_GC, ucp_P },
|
||||
{ 702, PT_PC, ucp_Pc },
|
||||
{ 705, PT_PC, ucp_Pd },
|
||||
{ 708, PT_PC, ucp_Pe },
|
||||
{ 711, PT_PC, ucp_Pf },
|
||||
{ 714, PT_SC, ucp_Phags_Pa },
|
||||
{ 723, PT_SC, ucp_Phoenician },
|
||||
{ 734, PT_PC, ucp_Pi },
|
||||
{ 737, PT_PC, ucp_Po },
|
||||
{ 740, PT_PC, ucp_Ps },
|
||||
{ 743, PT_SC, ucp_Rejang },
|
||||
{ 750, PT_SC, ucp_Runic },
|
||||
{ 756, PT_GC, ucp_S },
|
||||
{ 758, PT_SC, ucp_Samaritan },
|
||||
{ 768, PT_SC, ucp_Saurashtra },
|
||||
{ 779, PT_PC, ucp_Sc },
|
||||
{ 782, PT_SC, ucp_Shavian },
|
||||
{ 790, PT_SC, ucp_Sinhala },
|
||||
{ 798, PT_PC, ucp_Sk },
|
||||
{ 801, PT_PC, ucp_Sm },
|
||||
{ 804, PT_PC, ucp_So },
|
||||
{ 807, PT_SC, ucp_Sundanese },
|
||||
{ 817, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 830, PT_SC, ucp_Syriac },
|
||||
{ 837, PT_SC, ucp_Tagalog },
|
||||
{ 845, PT_SC, ucp_Tagbanwa },
|
||||
{ 854, PT_SC, ucp_Tai_Le },
|
||||
{ 861, PT_SC, ucp_Tai_Tham },
|
||||
{ 870, PT_SC, ucp_Tai_Viet },
|
||||
{ 879, PT_SC, ucp_Tamil },
|
||||
{ 885, PT_SC, ucp_Telugu },
|
||||
{ 892, PT_SC, ucp_Thaana },
|
||||
{ 899, PT_SC, ucp_Thai },
|
||||
{ 904, PT_SC, ucp_Tibetan },
|
||||
{ 912, PT_SC, ucp_Tifinagh },
|
||||
{ 921, PT_SC, ucp_Ugaritic },
|
||||
{ 930, PT_SC, ucp_Vai },
|
||||
{ 934, PT_ALNUM, 0 },
|
||||
{ 938, PT_PXSPACE, 0 },
|
||||
{ 942, PT_SPACE, 0 },
|
||||
{ 946, PT_WORD, 0 },
|
||||
{ 950, PT_SC, ucp_Yi },
|
||||
{ 953, PT_GC, ucp_Z },
|
||||
{ 955, PT_PC, ucp_Zl },
|
||||
{ 958, PT_PC, ucp_Zp },
|
||||
{ 961, PT_PC, ucp_Zs }
|
||||
};
|
||||
|
||||
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -54,42 +54,56 @@ strings. */
|
||||
*************************************************/
|
||||
|
||||
/* This function is called (optionally) at the start of compile or match, to
|
||||
validate that a supposed UTF-8 string is actually valid. The early check means
|
||||
check that a supposed UTF-8 string is actually valid. The early check means
|
||||
that subsequent code can assume it is dealing with a valid string. The check
|
||||
can be turned off for maximum performance, but the consequences of supplying
|
||||
an invalid string are then undefined.
|
||||
can be turned off for maximum performance, but the consequences of supplying an
|
||||
invalid string are then undefined.
|
||||
|
||||
Originally, this function checked according to RFC 2279, allowing for values in
|
||||
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
|
||||
the canonical format. Once somebody had pointed out RFC 3629 to me (it
|
||||
obsoletes 2279), additional restrictions were applied. The values are now
|
||||
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
|
||||
subrange 0xd000 to 0xdfff is excluded.
|
||||
subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
|
||||
characters is still checked.
|
||||
|
||||
From release 8.13 more information about the details of the error are passed
|
||||
back in the returned value:
|
||||
|
||||
PCRE_UTF8_ERR0 No error
|
||||
PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
|
||||
PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string
|
||||
PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string
|
||||
PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string
|
||||
PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string
|
||||
PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80
|
||||
PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629
|
||||
PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629
|
||||
PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
|
||||
PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
|
||||
PCRE_UTF8_ERR15 Overlong 2-byte sequence
|
||||
PCRE_UTF8_ERR16 Overlong 3-byte sequence
|
||||
PCRE_UTF8_ERR17 Overlong 4-byte sequence
|
||||
PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
|
||||
PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
|
||||
PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
|
||||
PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
|
||||
|
||||
Arguments:
|
||||
string points to the string
|
||||
length length of string, or -1 if the string is zero-terminated
|
||||
errp pointer to an error position offset variable
|
||||
|
||||
Returns: < 0 if the string is a valid UTF-8 string
|
||||
>= 0 otherwise; the value is the offset of the bad byte
|
||||
|
||||
Bad bytes can be:
|
||||
|
||||
. An isolated byte whose most significant bits are 0x80, because this
|
||||
can only correctly appear within a UTF-8 character;
|
||||
|
||||
. A byte whose most significant bits are 0xc0, but whose other bits indicate
|
||||
that there are more than 3 additional bytes (i.e. an RFC 2279 starting
|
||||
byte, which is no longer valid under RFC 3629);
|
||||
|
||||
.
|
||||
|
||||
The returned offset may also be equal to the length of the string; this means
|
||||
that one or more bytes is missing from the final UTF-8 character.
|
||||
Returns: = 0 if the string is a valid UTF-8 string
|
||||
> 0 otherwise, setting the offset of the bad character
|
||||
*/
|
||||
|
||||
int
|
||||
_pcre_valid_utf8(USPTR string, int length)
|
||||
_pcre_valid_utf8(USPTR string, int length, int *erroroffset)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
register USPTR p;
|
||||
@@ -97,84 +111,189 @@ register USPTR p;
|
||||
if (length < 0)
|
||||
{
|
||||
for (p = string; *p != 0; p++);
|
||||
length = p - string;
|
||||
length = (int)(p - string);
|
||||
}
|
||||
|
||||
for (p = string; length-- > 0; p++)
|
||||
{
|
||||
register int ab;
|
||||
register int c = *p;
|
||||
if (c < 128) continue;
|
||||
if (c < 0xc0) return p - string;
|
||||
register int ab, c, d;
|
||||
|
||||
c = *p;
|
||||
if (c < 128) continue; /* ASCII character */
|
||||
|
||||
if (c < 0xc0) /* Isolated 10xx xxxx byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string);
|
||||
return PCRE_UTF8_ERR20;
|
||||
}
|
||||
|
||||
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
|
||||
{
|
||||
*erroroffset = (int)(p - string);
|
||||
return PCRE_UTF8_ERR21;
|
||||
}
|
||||
|
||||
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
if (ab > 3) return p - string; /* Too many for RFC 3629 */
|
||||
if (length < ab) return p + 1 + length - string; /* Missing bytes */
|
||||
length -= ab;
|
||||
if (length < ab)
|
||||
{
|
||||
*erroroffset = (int)(p - string); /* Missing bytes */
|
||||
return ab - length; /* Codes ERR1 to ERR5 */
|
||||
}
|
||||
length -= ab; /* Length remaining */
|
||||
|
||||
/* Check top bits in the second byte */
|
||||
if ((*(++p) & 0xc0) != 0x80) return p - string;
|
||||
|
||||
/* Check for overlong sequences for each different length, and for the
|
||||
excluded range 0xd000 to 0xdfff. */
|
||||
if (((d = *(++p)) & 0xc0) != 0x80)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE_UTF8_ERR6;
|
||||
}
|
||||
|
||||
/* For each length, check that the remaining bytes start with the 0x80 bit
|
||||
set and not the 0x40 bit. Then check for an overlong sequence, and for the
|
||||
excluded range 0xd800 to 0xdfff. */
|
||||
|
||||
switch (ab)
|
||||
{
|
||||
/* Check for xx00 000x (overlong sequence) */
|
||||
/* 2-byte character. No further bytes to check for 0x80. Check first byte
|
||||
for xx00 000x (overlong sequence). */
|
||||
|
||||
case 1:
|
||||
if ((c & 0x3e) == 0) return p - string;
|
||||
continue; /* We know there aren't any more bytes to check */
|
||||
case 1: if ((c & 0x3e) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE_UTF8_ERR15;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 1010 xxxx (0xd000 - 0xdfff) */
|
||||
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
|
||||
for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 101x xxxx (0xd800 - 0xdfff) */
|
||||
|
||||
case 2:
|
||||
if ((c == 0xe0 && (*p & 0x20) == 0) ||
|
||||
(c == 0xed && *p >= 0xa0))
|
||||
return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if (c == 0xe0 && (d & 0x20) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR16;
|
||||
}
|
||||
if (c == 0xed && d >= 0xa0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR14;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or
|
||||
greater than 0x0010ffff (f4 8f bf bf) */
|
||||
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
|
||||
bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
|
||||
character greater than 0x0010ffff (f4 8f bf bf) */
|
||||
|
||||
case 3:
|
||||
if ((c == 0xf0 && (*p & 0x30) == 0) ||
|
||||
(c > 0xf4 ) ||
|
||||
(c == 0xf4 && *p > 0x8f))
|
||||
return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR8;
|
||||
}
|
||||
if (c == 0xf0 && (d & 0x30) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR17;
|
||||
}
|
||||
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR13;
|
||||
}
|
||||
break;
|
||||
|
||||
#if 0
|
||||
/* These cases can no longer occur, as we restrict to a maximum of four
|
||||
bytes nowadays. Leave the code here in case we ever want to add an option
|
||||
for longer sequences. */
|
||||
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
|
||||
rejected by the length test below. However, we do the appropriate tests
|
||||
here so that overlong sequences get diagnosed, and also in case there is
|
||||
ever an option for handling these larger code points. */
|
||||
|
||||
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
|
||||
1111 1000, xx00 0xxx */
|
||||
|
||||
/* Check for 1111 1000, xx00 0xxx */
|
||||
case 4:
|
||||
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE_UTF8_ERR9;
|
||||
}
|
||||
if (c == 0xf8 && (d & 0x38) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE_UTF8_ERR18;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
|
||||
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
|
||||
1111 1100, xx00 00xx. */
|
||||
|
||||
case 5:
|
||||
if (c == 0xfe || c == 0xff ||
|
||||
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE_UTF8_ERR9;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE_UTF8_ERR10;
|
||||
}
|
||||
if (c == 0xfc && (d & 0x3c) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE_UTF8_ERR19;
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/* Check for valid bytes after the 2nd, if any; all must start 10 */
|
||||
while (--ab > 0)
|
||||
/* Character is valid under RFC 2279, but 5-byte and 6-byte characters are
|
||||
excluded by RFC 3629. The pointer p is currently at the last byte of the
|
||||
character. */
|
||||
|
||||
if (ab > 3)
|
||||
{
|
||||
if ((*(++p) & 0xc0) != 0x80) return p - string;
|
||||
*erroroffset = (int)(p - string) - ab;
|
||||
return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
#else /* SUPPORT_UTF8 */
|
||||
(void)(string); /* Keep picky compilers happy */
|
||||
(void)(length);
|
||||
#endif
|
||||
|
||||
return -1;
|
||||
return PCRE_UTF8_ERR0; /* This indicates success */
|
||||
}
|
||||
|
||||
/* End of pcre_valid_utf8.c */
|
||||
|
||||
@@ -153,7 +153,11 @@ enum {
|
||||
ucp_Old_Turkic,
|
||||
ucp_Samaritan,
|
||||
ucp_Tai_Tham,
|
||||
ucp_Tai_Viet
|
||||
ucp_Tai_Viet,
|
||||
/* New for Unicode 6.0.0: */
|
||||
ucp_Batak,
|
||||
ucp_Brahmi,
|
||||
ucp_Mandaic
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user