2012-02-07 22:25 UTC+0100 Viktor Szakats (harbour syenar.net)

* src/3rd/pcre/Makefile
  * src/3rd/pcre/*
  + src/3rd/pcre/pcrejitc.c
    * 8.12 -> 8.21 (using hb3rdpat)
    ; NOTE: New JIT feature off by default, not enabled
This commit is contained in:
Viktor Szakats
2012-02-07 21:27:36 +00:00
parent 76c2ac96db
commit 0164645180
19 changed files with 12303 additions and 3759 deletions

View File

@@ -16,6 +16,13 @@
The license applies to all entries newer than 2009-04-28.
*/
2012-02-07 22:25 UTC+0100 Viktor Szakats (harbour syenar.net)
* src/3rd/pcre/Makefile
* src/3rd/pcre/*
+ src/3rd/pcre/pcrejitc.c
* 8.12 -> 8.21 (using hb3rdpat)
; NOTE: New JIT feature off by default, not enabled
2012-02-07 22:10 UTC+0100 Viktor Szakats (harbour syenar.net)
* src/3rd/jpeg/Makefile
* src/3rd/jpeg/*

View File

@@ -9,7 +9,9 @@ specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
The basic library functions are written in C and are freestanding. Also
included in the distribution is a set of C++ wrapper functions.
included in the distribution is a set of C++ wrapper functions, and a
just-in-time compiler that can be used to optimize pattern matching. These
are both optional features that can be omitted when the library is built.
THE BASIC LIBRARY FUNCTIONS
@@ -22,7 +24,29 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2011 University of Cambridge
All rights reserved.
PCRE JUST-IN-TIME COMPILATION SUPPORT
-------------------------------------
Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2010-2011 Zoltan Herczeg
All rights reserved.
STACK-LESS JUST-IN-TIME COMPILER
--------------------------------
Written by: Zoltan Herczeg
Email local part: hzmester
Email domain: freemail.hu
Copyright(c) 2009-2011 Zoltan Herczeg
All rights reserved.
@@ -31,7 +55,7 @@ THE C++ WRAPPER FUNCTIONS
Contributed by: Google Inc.
Copyright (c) 2007-2010, Google Inc.
Copyright (c) 2007-2011, Google Inc.
All rights reserved.

View File

@@ -19,6 +19,7 @@ C_SOURCES := \
pcreget.c \
pcreglob.c \
pcreinfo.c \
pcrejitc.c \
pcremktb.c \
pcrenewl.c \
pcreoutf.c \
@@ -72,8 +73,8 @@ else
endif
# ORIGIN http://www.pcre.org/
# VER 8.12
# URL ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.12.tar.gz
# VER 8.21
# URL ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.21.tar.gz
# DIFF pcre.dif
#
# MAP LICENCE
@@ -90,6 +91,7 @@ endif
# MAP pcre_get.c pcreget.c
# MAP pcre_globals.c pcreglob.c
# MAP pcre_info.c pcreinfo.c
# MAP pcre_jit_compile.c pcrejitc.c
# MAP pcre_maketables.c pcremktb.c
# MAP pcre_newline.c pcrenewl.c
# MAP pcre_ord2utf8.c pcreoutf.c

View File

@@ -250,7 +250,7 @@ them both to 0; an emulation function will be used. */
#define PACKAGE_NAME "PCRE"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE 8.12"
#define PACKAGE_STRING "PCRE 8.21"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre"
@@ -259,7 +259,17 @@ them both to 0; an emulation function will be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "8.12"
#define PACKAGE_VERSION "8.21"
/* The value of PCREGREP_BUFSIZE determines the size of buffer used by
pcregrep to hold parts of the file it is searching. On systems that support
it, "configure" can be used to override the default, which is 8192. This is
also the minimum value. The actual amount of memory used by pcregrep is
three times this number, because it allows for the buffering of "before"
and "after" lines. */
#ifndef PCREGREP_BUFSIZE
#define PCREGREP_BUFSIZE 20480
#endif
/* If you are compiling for a system other than a Unix-like system or
@@ -293,6 +303,9 @@ them both to 0; an emulation function will be used. */
#define STDC_HEADERS 1
#endif
/* Define to enable support for Just-In-Time compiling. */
/* #undef SUPPORT_JIT */
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
handle .bz2 files. */
/* #undef SUPPORT_LIBBZ2 */
@@ -304,7 +317,10 @@ them both to 0; an emulation function will be used. */
handle .gz files. */
/* #undef SUPPORT_LIBZ */
/* Define to enable support for Unicode properties */
/* Define to enable JIT support in pcregrep. */
/* #undef SUPPORT_PCREGREP_JIT */
/* Define to enable support for Unicode properties. */
/* #undef SUPPORT_UCP */
/* Define to enable support for the UTF-8 Unicode encoding. This will work
@@ -315,7 +331,7 @@ them both to 0; an emulation function will be used. */
/* Version number of package */
#ifndef VERSION
#define VERSION "8.12"
#define VERSION "8.21"
#endif
/* Define to empty if `const' does not conform to ANSI C. */

View File

@@ -1,7 +1,7 @@
diff -urN pcre.orig/pcrefinf.c pcre/pcrefinf.c
--- pcre.orig/pcrefinf.c 2011-01-15 18:09:50.426241817 +0100
+++ pcre/pcrefinf.c 2011-01-15 18:09:50.696242981 +0100
@@ -126,7 +126,7 @@
diff -urN pcre.orig\pcrefinf.c pcre\pcrefinf.c
--- pcre.orig\pcrefinf.c Tue Feb 07 22:22:32 2012
+++ pcre\pcrefinf.c Tue Feb 07 22:22:32 2012
@@ -139,7 +139,7 @@
case PCRE_INFO_MINLENGTH:
*((int *)where) =
(study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)?
@@ -9,10 +9,10 @@ diff -urN pcre.orig/pcrefinf.c pcre/pcrefinf.c
+ (int)study->minlength : -1;
break;
case PCRE_INFO_LASTLITERAL:
diff -urN pcre.orig/pcreglob.c pcre/pcreglob.c
--- pcre.orig/pcreglob.c 2011-01-15 18:09:50.446241418 +0100
+++ pcre/pcreglob.c 2011-01-15 18:09:50.696242981 +0100
case PCRE_INFO_JIT:
diff -urN pcre.orig\pcreglob.c pcre\pcreglob.c
--- pcre.orig\pcreglob.c Tue Feb 07 22:22:32 2012
+++ pcre\pcreglob.c Tue Feb 07 22:22:32 2012
@@ -74,11 +74,17 @@
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;

View File

@@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2011 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */
#define PCRE_MAJOR 8
#define PCRE_MINOR 12
#define PCRE_MINOR 21
#define PCRE_PRERELEASE
#define PCRE_DATE 2011-01-15
#define PCRE_DATE 2011-12-12
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate
@@ -98,20 +98,25 @@ extern "C" {
/* Options. Some are compile-time only, some are run-time only, and some are
both, so we keep them all distinct. However, almost all the bits in the options
word are now used. In the long run, we may have to re-use some of the
compile-time only bits for runtime options, or vice versa. */
compile-time only bits for runtime options, or vice versa. In the comments
below, "compile", "exec", and "DFA exec" mean that the option is permitted to
be set for those functions; "used in" means that an option may be set only for
compile, but is subsequently referenced in exec and/or DFA exec. Any of the
compile-time options may be inspected during studying (and therefore JIT
compiling). */
#define PCRE_CASELESS 0x00000001 /* Compile */
#define PCRE_MULTILINE 0x00000002 /* Compile */
#define PCRE_DOTALL 0x00000004 /* Compile */
#define PCRE_EXTENDED 0x00000008 /* Compile */
#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile, used in exec, DFA exec */
#define PCRE_EXTRA 0x00000040 /* Compile */
#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
#define PCRE_UNGREEDY 0x00000200 /* Compile */
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
#define PCRE_UTF8 0x00000800 /* Compile */
#define PCRE_UTF8 0x00000800 /* Compile, used in exec, DFA exec */
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
@@ -119,7 +124,7 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
#define PCRE_FIRSTLINE 0x00040000 /* Compile */
#define PCRE_FIRSTLINE 0x00040000 /* Compile, used in exec, DFA exec */
#define PCRE_DUPNAMES 0x00080000 /* Compile */
#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
@@ -128,12 +133,12 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile, used in exec */
#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
#define PCRE_UCP 0x20000000 /* Compile */
#define PCRE_UCP 0x20000000 /* Compile, used in exec, DFA exec */
/* Exec-time and get/set-time error codes */
@@ -163,6 +168,33 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_ERROR_BADNEWLINE (-23)
#define PCRE_ERROR_BADOFFSET (-24)
#define PCRE_ERROR_SHORTUTF8 (-25)
#define PCRE_ERROR_RECURSELOOP (-26)
#define PCRE_ERROR_JIT_STACKLIMIT (-27)
/* Specific error codes for UTF-8 validity checks */
#define PCRE_UTF8_ERR0 0
#define PCRE_UTF8_ERR1 1
#define PCRE_UTF8_ERR2 2
#define PCRE_UTF8_ERR3 3
#define PCRE_UTF8_ERR4 4
#define PCRE_UTF8_ERR5 5
#define PCRE_UTF8_ERR6 6
#define PCRE_UTF8_ERR7 7
#define PCRE_UTF8_ERR8 8
#define PCRE_UTF8_ERR9 9
#define PCRE_UTF8_ERR10 10
#define PCRE_UTF8_ERR11 11
#define PCRE_UTF8_ERR12 12
#define PCRE_UTF8_ERR13 13
#define PCRE_UTF8_ERR14 14
#define PCRE_UTF8_ERR15 15
#define PCRE_UTF8_ERR16 16
#define PCRE_UTF8_ERR17 17
#define PCRE_UTF8_ERR18 18
#define PCRE_UTF8_ERR19 19
#define PCRE_UTF8_ERR20 20
#define PCRE_UTF8_ERR21 21
/* Request types for pcre_fullinfo() */
@@ -183,6 +215,8 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_INFO_JCHANGED 13
#define PCRE_INFO_HASCRORLF 14
#define PCRE_INFO_MINLENGTH 15
#define PCRE_INFO_JIT 16
#define PCRE_INFO_JITSIZE 17
/* Request types for pcre_config(). Do not re-arrange, in order to remain
compatible. */
@@ -196,6 +230,12 @@ compatible. */
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
#define PCRE_CONFIG_BSR 8
#define PCRE_CONFIG_JIT 9
/* Request types for pcre_study(). Do not re-arrange, in order to remain
compatible. */
#define PCRE_STUDY_JIT_COMPILE 0x0001
/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
these bits, just add new ones on the end, in order to remain compatible. */
@@ -206,12 +246,16 @@ these bits, just add new ones on the end, in order to remain compatible. */
#define PCRE_EXTRA_TABLES 0x0008
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
#define PCRE_EXTRA_MARK 0x0020
#define PCRE_EXTRA_EXECUTABLE_JIT 0x0040
/* Types */
struct real_pcre; /* declaration; the definition is private */
typedef struct real_pcre pcre;
struct real_pcre_jit_stack; /* declaration; the definition is private */
typedef struct real_pcre_jit_stack pcre_jit_stack;
/* When PCRE is compiled as a C++ library, the subject pointer type can be
replaced with a custom type. For conventional use, the public interface is a
const char *. */
@@ -232,6 +276,7 @@ typedef struct pcre_extra {
const unsigned char *tables; /* Pointer to character tables */
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
unsigned char **mark; /* For passing back a mark pointer */
void *executable_jit; /* Contains a pointer to a compiled jit code */
} pcre_extra;
/* The structure for passing out data via the pcre_callout_function. We use a
@@ -254,6 +299,8 @@ typedef struct pcre_callout_block {
/* ------------------- Added for Version 1 -------------------------- */
int pattern_position; /* Offset to next item in the pattern */
int next_item_length; /* Length of next item in the pattern */
/* ------------------- Added for Version 2 -------------------------- */
const unsigned char *mark; /* Pointer to current mark or NULL */
/* ------------------------------------------------------------------ */
} pcre_callout_block;
@@ -277,6 +324,10 @@ PCRE_EXP_DECL void pcre_stack_free(void *);
PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
#endif /* VPCOMPAT */
/* User defined callback which provides a stack just before the match starts. */
typedef pcre_jit_stack *(*pcre_jit_callback)(void *);
/* Exported PCRE functions */
PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
@@ -309,8 +360,15 @@ PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
PCRE_EXP_DECL const char *pcre_version(void);
/* JIT compiler related functions. */
PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int);
PCRE_EXP_DECL void pcre_jit_stack_free(pcre_jit_stack *);
PCRE_EXP_DECL void pcre_assign_jit_stack(pcre_extra *, pcre_jit_callback, void *);
#ifdef __cplusplus
} /* extern "C" */
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2011 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -83,6 +83,14 @@ switch (what)
#endif
break;
case PCRE_CONFIG_JIT:
#ifdef SUPPORT_JIT
*((int *)where) = 1;
#else
*((int *)where) = 0;
#endif
break;
case PCRE_CONFIG_NEWLINE:
*((int *)where) = NEWLINE;
break;

View File

@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language (but see
below for why this module is different).
Written by Philip Hazel
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2011 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -121,18 +121,25 @@ static const uschar coptable[] = {
0, 0, /* \P, \p */
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
0, /* \X */
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
1, /* Char */
1, /* Charnc */
1, /* Chari */
1, /* not */
1, /* noti */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
3, 3, 3, /* upto, minupto, exact */
1, 1, 1, 3, /* *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
3, 3, 3, /* upto I, minupto I, exact I */
1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
3, 3, 3, /* NOT upto, minupto, exact */
1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
3, 3, 3, /* NOT upto I, minupto I, exact I */
1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
3, 3, 3, /* Type upto, minupto, exact */
@@ -144,26 +151,30 @@ static const uschar coptable[] = {
0, /* NCLASS */
0, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
0, /* Reverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
0, /* Reverse */
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
0, 0, 0, /* SBRA, SCBRA, SCOND */
0, 0, /* ONCE, ONCE_NC */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, NCREF */
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0 /* CLOSE, SKIPZERO */
};
/* This table identifies those opcodes that inspect a character. It is used to
@@ -179,18 +190,25 @@ static const uschar poptable[] = {
1, 1, /* \P, \p */
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
1, /* \X */
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
1, /* Char */
1, /* Charnc */
1, /* Chari */
1, /* not */
1, /* noti */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, 1, /* upto, minupto, exact */
1, 1, 1, 1, /* *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
1, 1, 1, /* upto I, minupto I, exact I */
1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1, 1, 1, /* NOT upto, minupto, exact */
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
1, 1, 1, /* NOT upto I, minupto I, exact I */
1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1, 1, 1, /* Type upto, minupto, exact */
@@ -202,26 +220,30 @@ static const uschar poptable[] = {
1, /* NCLASS */
1, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
0, /* Reverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
0, /* Reverse */
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
0, 0, 0, /* SBRA, SCBRA, SCOND */
0, 0, /* ONCE, ONCE_NC */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, NCREF */
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0 /* CLOSE, SKIPZERO */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -252,7 +274,6 @@ these structures in, is a vector of ints. */
typedef struct stateblock {
int offset; /* Offset to opcode */
int count; /* Count for repeats */
int ims; /* ims flag bits */
int data; /* Some use extra data */
} stateblock;
@@ -308,9 +329,7 @@ Arguments:
offsetcount size of same
workspace vector of workspace
wscount size of same
ims the current ims flags
rlevel function call recursion level
recursing regex recursive call level
Returns: > 0 => number of match offset pairs placed in offsets
= 0 => offsets overflowed; longest matches are present
@@ -325,7 +344,6 @@ for the current character, one for the following character). */
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
next_active_state->ims = ims; \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
@@ -336,7 +354,6 @@ for the current character, one for the following character). */
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
next_active_state->ims = ims; \
next_active_state->data = (z); \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
@@ -348,7 +365,6 @@ for the current character, one for the following character). */
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
next_new_state->ims = ims; \
next_new_state++; \
DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
@@ -359,7 +375,6 @@ for the current character, one for the following character). */
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
next_new_state->ims = ims; \
next_new_state->data = (z); \
next_new_state++; \
DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
@@ -378,9 +393,7 @@ internal_dfa_exec(
int offsetcount,
int *workspace,
int wscount,
int ims,
int rlevel,
int recursing)
int rlevel)
{
stateblock *active_states, *new_states, *temp_states;
stateblock *next_active_state, *next_new_state;
@@ -389,6 +402,8 @@ const uschar *ctypes, *lcc, *fcc;
const uschar *ptr;
const uschar *end_code, *first_op;
dfa_recursion_info new_recursive;
int active_count, new_count, match_count;
/* Some fields in the md block are frequently referenced, so we load them into
@@ -412,8 +427,8 @@ wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
(2 * INTS_PER_STATEBLOCK);
DPRINTF(("\n%.*s---------------------\n"
"%.*sCall to internal_dfa_exec f=%d r=%d\n",
rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
"%.*sCall to internal_dfa_exec f=%d\n",
rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
ctypes = md->tables + ctypes_offset;
lcc = md->tables + lcc_offset;
@@ -426,7 +441,8 @@ next_new_state = new_states = active_states + wscount;
new_count = 0;
first_op = this_start_code + 1 + LINK_SIZE +
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
@@ -525,7 +541,9 @@ else
else
{
int length = 1 + LINK_SIZE +
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
2:0);
do
{
ADD_NEW((int)(end_code - start_code + length), 0);
@@ -605,6 +623,7 @@ for (;;)
for (i = 0; i < active_count; i++)
{
stateblock *current_state = active_states + i;
BOOL caseless = FALSE;
const uschar *code;
int state_offset = current_state->offset;
int count, codevalue, rrc;
@@ -616,10 +635,6 @@ for (;;)
else printf("0x%02x\n", c);
#endif
/* This variable is referred to implicitly in the ADD_xxx macros. */
ims = current_state->ims;
/* A negative offset is a special case meaning "hold off going to this
(negated) state until the number of characters in the data field have
been skipped". */
@@ -725,7 +740,12 @@ for (;;)
/* ========================================================================== */
/* Reached a closing bracket. If not at the end of the pattern, carry
on with the next opcode. Otherwise, unless we have an empty string and
on with the next opcode. For repeating opcodes, also add the repeat
state. Note that KETRPOS will always be encountered at the end of the
subpattern, because the possessive subpattern repeats are always handled
using recursive calls. Thus, it never adds any new states.
At the end of the (sub)pattern, unless we have an empty string and
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
start of the subject, save the match data, shifting up all previous
matches so we always have the longest first. */
@@ -733,6 +753,7 @@ for (;;)
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
case OP_KETRPOS:
if (code != end_code)
{
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
@@ -749,7 +770,7 @@ for (;;)
current_subject > start_subject + md->start_offset)))
{
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
else if (match_count > 0 && ++match_count * 2 > offsetcount)
match_count = 0;
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
@@ -822,10 +843,14 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_CIRC:
if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_CIRCM:
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
((ims & PCRE_MULTILINE) != 0 &&
ptr != end_subject &&
WAS_NEWLINE(ptr)))
(ptr != end_subject && WAS_NEWLINE(ptr)))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
@@ -839,12 +864,6 @@ for (;;)
}
break;
/*-----------------------------------------------------------------*/
case OP_OPT:
ims = code[1];
ADD_ACTIVE(state_offset + 2, 0);
break;
/*-----------------------------------------------------------------*/
case OP_SOD:
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
@@ -890,11 +909,23 @@ for (;;)
could_continue = TRUE;
else if (clen == 0 ||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
(ptr == end_subject - md->nllen)
))
{ ADD_ACTIVE(state_offset + 1, 0); }
}
else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
break;
/*-----------------------------------------------------------------*/
case OP_DOLLM:
if ((md->moptions & PCRE_NOTEOL) == 0)
{
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
could_continue = TRUE;
else if (clen == 0 ||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
{ ADD_ACTIVE(state_offset + 1, 0); }
}
else if (IS_NEWLINE(ptr))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
@@ -1950,7 +1981,7 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
case OP_CHARNC:
case OP_CHARI:
if (clen == 0) break;
#ifdef SUPPORT_UTF8
@@ -2136,19 +2167,35 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
/* Match a negated single character. This is only used for one-byte
characters, that is, we know that d < 256. The character we are
/* Match a negated single character casefully. This is only used for
one-byte characters, that is, we know that d < 256. The character we are
checking (c) can be multibyte. */
case OP_NOT:
if (clen > 0)
{
unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
}
if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
break;
/*-----------------------------------------------------------------*/
/* Match a negated single character caselessly. This is only used for
one-byte characters, that is, we know that d < 256. The character we are
checking (c) can be multibyte. */
case OP_NOTI:
if (clen > 0 && c != d && c != fcc[d])
{ ADD_NEW(state_offset + dlen + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
case OP_NOTPLUSI:
case OP_NOTMINPLUSI:
case OP_NOTPOSPLUSI:
caseless = TRUE;
codevalue -= OP_STARI - OP_STAR;
/* Fall through */
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
@@ -2160,7 +2207,7 @@ for (;;)
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
if (caseless)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
@@ -2188,6 +2235,15 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
case OP_NOTQUERYI:
case OP_NOTMINQUERYI:
case OP_NOTPOSQUERYI:
caseless = TRUE;
codevalue -= OP_STARI - OP_STAR;
/* Fall through */
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
@@ -2198,7 +2254,7 @@ for (;;)
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
if (caseless)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
@@ -2224,6 +2280,15 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
case OP_STARI:
case OP_MINSTARI:
case OP_POSSTARI:
case OP_NOTSTARI:
case OP_NOTMINSTARI:
case OP_NOTPOSSTARI:
caseless = TRUE;
codevalue -= OP_STARI - OP_STAR;
/* Fall through */
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
@@ -2234,7 +2299,7 @@ for (;;)
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
if (caseless)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
@@ -2260,13 +2325,18 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
case OP_EXACTI:
case OP_NOTEXACTI:
caseless = TRUE;
codevalue -= OP_STARI - OP_STAR;
/* Fall through */
case OP_EXACT:
case OP_NOTEXACT:
count = current_state->count; /* Number already matched */
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
if (caseless)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
@@ -2290,6 +2360,15 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
case OP_NOTUPTOI:
case OP_NOTMINUPTOI:
case OP_NOTPOSUPTOI:
caseless = TRUE;
codevalue -= OP_STARI - OP_STAR;
/* Fall through */
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
@@ -2301,7 +2380,7 @@ for (;;)
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
if (caseless)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
@@ -2444,9 +2523,7 @@ for (;;)
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing); /* pass on regex recursion */
rlevel); /* function recursion level */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
@@ -2485,6 +2562,7 @@ for (;;)
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = NULL; /* No (*MARK) support */
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc > 0) break; /* Fail this thread */
@@ -2511,7 +2589,7 @@ for (;;)
{
int value = GET2(code, LINK_SIZE+2);
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
if (recursing > 0)
if (md->recursive != NULL)
{ ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
@@ -2535,9 +2613,7 @@ for (;;)
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing); /* pass on regex recursion */
rlevel); /* function recursion level */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) ==
@@ -2552,28 +2628,47 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_RECURSE:
{
dfa_recursion_info *ri;
int local_offsets[1000];
int local_workspace[1000];
const uschar *callpat = start_code + GET(code, 1);
int recno = (callpat == md->start_code)? 0 :
GET2(callpat, 1 + LINK_SIZE);
int rc;
DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
recursing + 1));
DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
/* Check for repeating a recursion without advancing the subject
pointer. This should catch convoluted mutual recursions. (Some simple
cases are caught at compile time.) */
for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
if (recno == ri->group_num && ptr == ri->subject_position)
return PCRE_ERROR_RECURSELOOP;
/* Remember this recursion and where we started it so as to
catch infinite loops. */
new_recursive.group_num = recno;
new_recursive.subject_position = ptr;
new_recursive.prevrec = md->recursive;
md->recursive = &new_recursive;
rc = internal_dfa_exec(
md, /* fixed match data */
start_code + GET(code, 1), /* this subexpression's code */
callpat, /* this subexpression's code */
ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing + 1); /* regex recurse level */
rlevel); /* function recursion level */
DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
recursing + 1, rc));
md->recursive = new_recursive.prevrec; /* Done this recursion */
DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
rc));
/* Ran out of internal offsets */
@@ -2605,8 +2700,98 @@ for (;;)
}
break;
/*-----------------------------------------------------------------*/
case OP_BRAPOS:
case OP_SBRAPOS:
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_BRAPOSZERO:
{
int charcount, matched_count;
const uschar *local_ptr = ptr;
BOOL allow_zero;
if (codevalue == OP_BRAPOSZERO)
{
allow_zero = TRUE;
codevalue = *(++code); /* Codevalue will be one of above BRAs */
}
else allow_zero = FALSE;
/* Loop to match the subpattern as many times as possible as if it were
a complete pattern. */
for (matched_count = 0;; matched_count++)
{
int local_offsets[2];
int local_workspace[1000];
int rc = internal_dfa_exec(
md, /* fixed match data */
code, /* this subexpression's code */
local_ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
rlevel); /* function recursion level */
/* Failed to match */
if (rc < 0)
{
if (rc != PCRE_ERROR_NOMATCH) return rc;
break;
}
/* Matched: break the loop if zero characters matched. */
charcount = local_offsets[1] - local_offsets[0];
if (charcount == 0) break;
local_ptr += charcount; /* Advance temporary position ptr */
}
/* At this point we have matched the subpattern matched_count
times, and local_ptr is pointing to the character after the end of the
last match. */
if (matched_count > 0 || allow_zero)
{
const uschar *end_subpattern = code;
int next_state_offset;
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
next_state_offset =
(int)(end_subpattern - start_code + LINK_SIZE + 1);
/* Optimization: if there are no more active states, and there
are no new states yet set up, then skip over the subject string
right here, to save looping. Otherwise, set up the new state to swing
into action when the end of the matched substring is reached. */
if (i + 1 >= active_count && new_count == 0)
{
ptr = local_ptr;
clen = 0;
ADD_NEW(next_state_offset, 0);
}
else
{
const uschar *p = ptr;
const uschar *pp = local_ptr;
charcount = (int)(pp - p);
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
}
}
}
break;
/*-----------------------------------------------------------------*/
case OP_ONCE:
case OP_ONCE_NC:
{
int local_offsets[2];
int local_workspace[1000];
@@ -2620,9 +2805,7 @@ for (;;)
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing); /* pass on regex recursion */
rlevel); /* function recursion level */
if (rc >= 0)
{
@@ -2656,7 +2839,7 @@ for (;;)
/* Optimization: if there are no more active states, and there
are no new states yet set up, then skip over the subject string
right here, to save looping. Otherwise, set up the new state to swing
into action when the end of the substring is reached. */
into action when the end of the matched substring is reached. */
else if (i + 1 >= active_count && new_count == 0)
{
@@ -2686,7 +2869,6 @@ for (;;)
if (repeat_state_offset >= 0)
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
}
}
else if (rc != PCRE_ERROR_NOMATCH) return rc;
}
@@ -2713,6 +2895,7 @@ for (;;)
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = NULL; /* No (*MARK) support */
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc == 0)
@@ -2963,15 +3146,21 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int tb;
if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
int erroroffset;
int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
if (errorcode != 0)
{
tb = ((USPTR)subject)[start_offset] & 0xc0;
if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
if (offsetcount >= 2)
{
offsets[0] = erroroffset;
offsets[1] = errorcode;
}
return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
}
if (start_offset > 0 && start_offset < length &&
(((USPTR)subject)[start_offset] & 0xc0) == 0x80)
return PCRE_ERROR_BADUTF8_OFFSET;
}
#endif
@@ -3141,7 +3330,7 @@ for (;;)
disabling is explicitly requested (and of course, by the test above, this
code is not obeyed when restarting after a partial match). */
if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
{
/* If the pattern was studied, a minimum subject length may be set. This
@@ -3209,6 +3398,7 @@ for (;;)
/* OK, now we can do the business */
md->start_used_ptr = current_subject;
md->recursive = NULL;
rc = internal_dfa_exec(
md, /* fixed match data */
@@ -3219,9 +3409,7 @@ for (;;)
offsetcount, /* size of same */
workspace, /* workspace vector */
wscount, /* size of same */
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
0, /* function recurse level */
0); /* regex recurse level */
0); /* function recurse level */
/* Anything other than "no match" means we are done, always; otherwise, carry
on only if not anchored. */

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2011 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -100,6 +100,19 @@ switch (what)
*((size_t *)where) = (study == NULL)? 0 : study->size;
break;
case PCRE_INFO_JITSIZE:
#ifdef SUPPORT_JIT
*((size_t *)where) =
(extra_data != NULL &&
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra_data->executable_jit != NULL)?
_pcre_jit_get_size(extra_data->executable_jit) : 0;
#else
*((size_t *)where) = 0;
#endif
break;
case PCRE_INFO_CAPTURECOUNT:
*((int *)where) = re->top_bracket;
break;
@@ -129,6 +142,12 @@ switch (what)
(int)study->minlength : -1;
break;
case PCRE_INFO_JIT:
*((int *)where) = extra_data != NULL &&
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra_data->executable_jit != NULL;
break;
case PCRE_INFO_LASTLITERAL:
*((int *)where) =
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;

View File

@@ -7,7 +7,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2010 University of Cambridge
Copyright (c) 1997-2011 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -582,10 +582,6 @@ Standard C system should have one. */
#endif
/* These are the public options that can change during matching. */
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
/* Private flags containing information about the compiled regex. They used to
live at the top end of the options word, but that got almost full, so now they
are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
@@ -598,11 +594,12 @@ compatibility. */
#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
#define PCRE_JCHANGED 0x0010 /* j option used in regex */
#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
#define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */
/* Options for the "extra" block produced by pcre_study(). */
/* Flags for the "extra" block produced by pcre_study(). */
#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
#define PCRE_STUDY_MINLEN 0x02 /* a minimum length field exists */
#define PCRE_STUDY_MAPPED 0x0001 /* a map of starting chars exists */
#define PCRE_STUDY_MINLEN 0x0002 /* a minimum length field exists */
/* Masks for identifying the public options that are permitted at compile
time, run time, or study time, respectively. */
@@ -628,7 +625,8 @@ time, run time, or study time, respectively. */
PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
PCRE_NO_START_OPTIMIZE)
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
#define PUBLIC_STUDY_OPTIONS \
PCRE_STUDY_JIT_COMPILE
/* Magic number to provide a small check against being handed junk. Also used
to detect whether a pattern was compiled on a host of different endianness. */
@@ -1254,8 +1252,8 @@ value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
like \N.
used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In
non-DOTALL mode, "." behaves like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
@@ -1299,6 +1297,7 @@ enum {
OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
OP_ANY, /* 12 Match any character except newline */
OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
@@ -1313,141 +1312,205 @@ enum {
OP_EODN, /* 23 End of data or \n at end of data: \Z. */
OP_EOD, /* 24 End of data: \z */
OP_OPT, /* 25 Set runtime options */
OP_CIRC, /* 26 Start of line - varies with multiline switch */
OP_DOLL, /* 27 End of line - varies with multiline switch */
OP_CHAR, /* 28 Match one character, casefully */
OP_CHARNC, /* 29 Match one character, caselessly */
OP_NOT, /* 30 Match one character, not the following one */
OP_CIRC, /* 25 Start of line - not multiline */
OP_CIRCM, /* 26 Start of line - multiline */
OP_DOLL, /* 27 End of line - not multiline */
OP_DOLLM, /* 28 End of line - multiline */
OP_CHAR, /* 29 Match one character, casefully */
OP_CHARI, /* 30 Match one character, caselessly */
OP_NOT, /* 31 Match one character, not the given one, casefully */
OP_NOTI, /* 32 Match one character, not the given one, caselessly */
OP_STAR, /* 31 The maximizing and minimizing versions of */
OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
OP_PLUS, /* 33 the minimizing one second. */
OP_MINPLUS, /* 34 This first set applies to single characters.*/
OP_QUERY, /* 35 */
OP_MINQUERY, /* 36 */
/* The following sets of 13 opcodes must always be kept in step because
the offset from the first one is used to generate the others. */
OP_UPTO, /* 37 From 0 to n matches */
OP_MINUPTO, /* 38 */
OP_EXACT, /* 39 Exactly n matches */
/**** Single characters, caseful, must precede the caseless ones ****/
OP_POSSTAR, /* 40 Possessified star */
OP_POSPLUS, /* 41 Possessified plus */
OP_POSQUERY, /* 42 Possessified query */
OP_POSUPTO, /* 43 Possessified upto */
OP_STAR, /* 33 The maximizing and minimizing versions of */
OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */
OP_PLUS, /* 35 the minimizing one second. */
OP_MINPLUS, /* 36 */
OP_QUERY, /* 37 */
OP_MINQUERY, /* 38 */
OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
OP_NOTMINQUERY, /* 49 */
OP_UPTO, /* 39 From 0 to n matches of one character, caseful*/
OP_MINUPTO, /* 40 */
OP_EXACT, /* 41 Exactly n matches */
OP_NOTUPTO, /* 50 From 0 to n matches */
OP_NOTMINUPTO, /* 51 */
OP_NOTEXACT, /* 52 Exactly n matches */
OP_POSSTAR, /* 42 Possessified star, caseful */
OP_POSPLUS, /* 43 Possessified plus, caseful */
OP_POSQUERY, /* 44 Possessified query, caseful */
OP_POSUPTO, /* 45 Possessified upto, caseful */
OP_NOTPOSSTAR, /* 53 Possessified versions */
OP_NOTPOSPLUS, /* 54 */
OP_NOTPOSQUERY, /* 55 */
OP_NOTPOSUPTO, /* 56 */
/**** Single characters, caseless, must follow the caseful ones */
OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
OP_TYPEMINQUERY, /* 62 */
OP_STARI, /* 46 */
OP_MINSTARI, /* 47 */
OP_PLUSI, /* 48 */
OP_MINPLUSI, /* 49 */
OP_QUERYI, /* 50 */
OP_MINQUERYI, /* 51 */
OP_TYPEUPTO, /* 63 From 0 to n matches */
OP_TYPEMINUPTO, /* 64 */
OP_TYPEEXACT, /* 65 Exactly n matches */
OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */
OP_MINUPTOI, /* 53 */
OP_EXACTI, /* 54 */
OP_TYPEPOSSTAR, /* 66 Possessified versions */
OP_TYPEPOSPLUS, /* 67 */
OP_TYPEPOSQUERY, /* 68 */
OP_TYPEPOSUPTO, /* 69 */
OP_POSSTARI, /* 55 Possessified star, caseless */
OP_POSPLUSI, /* 56 Possessified plus, caseless */
OP_POSQUERYI, /* 57 Possessified query, caseless */
OP_POSUPTOI, /* 58 Possessified upto, caseless */
OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 72 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
OP_CRQUERY, /* 74 These are for character classes and back refs */
OP_CRMINQUERY, /* 75 */
OP_CRRANGE, /* 76 These are different to the three sets above. */
OP_CRMINRANGE, /* 77 */
/**** The negated ones must follow the non-negated ones, and match them ****/
/**** Negated single character, caseful; must precede the caseless ones ****/
OP_CLASS, /* 78 Match a character class, chars < 256 only */
OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
class - the difference is relevant only when a UTF-8
character > 255 is encountered. */
OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */
OP_NOTPLUS, /* 61 the minimizing one second. They must be in */
OP_NOTMINPLUS, /* 62 exactly the same order as those above. */
OP_NOTQUERY, /* 63 */
OP_NOTMINQUERY, /* 64 */
OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
class. This does both positive and negative. */
OP_NOTUPTO, /* 65 From 0 to n matches, caseful */
OP_NOTMINUPTO, /* 66 */
OP_NOTEXACT, /* 67 Exactly n matches */
OP_REF, /* 81 Match a back reference */
OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 83 Call out to external function if provided */
OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */
OP_NOTPOSPLUS, /* 69 */
OP_NOTPOSQUERY, /* 70 */
OP_NOTPOSUPTO, /* 71 */
OP_ALT, /* 84 Start of alternation */
OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 86 These two must remain together and in this */
OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */
/**** Negated single character, caseless; must follow the caseful ones ****/
/* The assertions must come before BRA, CBRA, ONCE, and COND.*/
OP_NOTSTARI, /* 72 */
OP_NOTMINSTARI, /* 73 */
OP_NOTPLUSI, /* 74 */
OP_NOTMINPLUSI, /* 75 */
OP_NOTQUERYI, /* 76 */
OP_NOTMINQUERYI, /* 77 */
OP_ASSERT, /* 88 Positive lookahead */
OP_ASSERT_NOT, /* 89 Negative lookahead */
OP_ASSERTBACK, /* 90 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */
OP_NOTMINUPTOI, /* 79 */
OP_NOTEXACTI, /* 80 Exactly n matches */
/* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
as there's a test for >= ONCE for a subpattern that isn't an assertion. */
OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */
OP_NOTPOSPLUSI, /* 82 */
OP_NOTPOSQUERYI, /* 83 */
OP_NOTPOSUPTOI, /* 84 */
OP_ONCE, /* 93 Atomic group */
OP_BRA, /* 94 Start of non-capturing bracket */
OP_CBRA, /* 95 Start of capturing bracket */
OP_COND, /* 96 Conditional group */
/**** Character types ****/
/* These three must follow the previous three, in the same order. There's a
OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */
OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 89 */
OP_TYPEMINQUERY, /* 90 */
OP_TYPEUPTO, /* 91 From 0 to n matches */
OP_TYPEMINUPTO, /* 92 */
OP_TYPEEXACT, /* 93 Exactly n matches */
OP_TYPEPOSSTAR, /* 94 Possessified versions */
OP_TYPEPOSPLUS, /* 95 */
OP_TYPEPOSQUERY, /* 96 */
OP_TYPEPOSUPTO, /* 97 */
/* These are used for character classes and back references; only the
first six are the same as the sets above. */
OP_CRSTAR, /* 98 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 100 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */
OP_CRQUERY, /* 102 */
OP_CRMINQUERY, /* 103 */
OP_CRRANGE, /* 104 These are different to the three sets above. */
OP_CRMINRANGE, /* 105 */
/* End of quantifier opcodes */
OP_CLASS, /* 106 Match a character class, chars < 256 only */
OP_NCLASS, /* 107 Same, but the bitmap was created from a negative
class - the difference is relevant only when a
UTF-8 character > 255 is encountered. */
OP_XCLASS, /* 108 Extended class for handling UTF-8 chars within the
class. This does both positive and negative. */
OP_REF, /* 109 Match a back reference, casefully */
OP_REFI, /* 110 Match a back reference, caselessly */
OP_RECURSE, /* 111 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 112 Call out to external function if provided */
OP_ALT, /* 113 Start of alternation */
OP_KET, /* 114 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 115 These two must remain together and in this */
OP_KETRMIN, /* 116 order. They are for groups that repeat for ever. */
OP_KETRPOS, /* 117 Possessive unlimited repeat. */
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
asserts must remain in order. */
OP_REVERSE, /* 118 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 119 Positive lookahead */
OP_ASSERT_NOT, /* 120 Negative lookahead */
OP_ASSERTBACK, /* 121 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 122 Negative lookbehind */
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
after the assertions, with ONCE first, as there's a test for >= ONCE for a
subpattern that isn't an assertion. The POS versions must immediately follow
the non-POS versions in each case. */
OP_ONCE, /* 123 Atomic group, contains captures */
OP_ONCE_NC, /* 124 Atomic group containing no captures */
OP_BRA, /* 125 Start of non-capturing bracket */
OP_BRAPOS, /* 126 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 127 Start of capturing bracket */
OP_CBRAPOS, /* 128 Ditto, with unlimited, possessive repeat */
OP_COND, /* 129 Conditional group */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
OP_SCBRA, /* 98 Start of capturing bracket, check empty */
OP_SCOND, /* 99 Conditional group, check empty */
OP_SBRA, /* 130 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 131 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 132 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 134 Conditional group, check empty */
/* The next two pairs must (respectively) be kept together. */
OP_CREF, /* 100 Used to hold a capture number as condition */
OP_NCREF, /* 101 Same, but generated by a name reference*/
OP_RREF, /* 102 Used to hold a recursion number as condition */
OP_NRREF, /* 103 Same, but generated by a name reference*/
OP_DEF, /* 104 The DEFINE condition */
OP_CREF, /* 135 Used to hold a capture number as condition */
OP_NCREF, /* 136 Same, but generated by a name reference*/
OP_RREF, /* 137 Used to hold a recursion number as condition */
OP_NRREF, /* 138 Same, but generated by a name reference*/
OP_DEF, /* 139 The DEFINE condition */
OP_BRAZERO, /* 105 These two must remain together and in this */
OP_BRAMINZERO, /* 106 order. */
OP_BRAZERO, /* 140 These two must remain together and in this */
OP_BRAMINZERO, /* 141 order. */
OP_BRAPOSZERO, /* 142 */
/* These are backtracking control verbs */
OP_MARK, /* 107 always has an argument */
OP_PRUNE, /* 108 */
OP_PRUNE_ARG, /* 109 same, but with argument */
OP_SKIP, /* 110 */
OP_SKIP_ARG, /* 111 same, but with argument */
OP_THEN, /* 112 */
OP_THEN_ARG, /* 113 same, but with argument */
OP_COMMIT, /* 114 */
OP_MARK, /* 143 always has an argument */
OP_PRUNE, /* 144 */
OP_PRUNE_ARG, /* 145 same, but with argument */
OP_SKIP, /* 146 */
OP_SKIP_ARG, /* 147 same, but with argument */
OP_THEN, /* 148 */
OP_THEN_ARG, /* 149 same, but with argument */
OP_COMMIT, /* 150 */
/* These are forced failure and success verbs */
OP_FAIL, /* 115 */
OP_ACCEPT, /* 116 */
OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
OP_FAIL, /* 151 */
OP_ACCEPT, /* 152 */
OP_ASSERT_ACCEPT, /* 153 Used inside assertions */
OP_CLOSE, /* 154 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO, /* 118 */
OP_SKIPZERO, /* 155 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@@ -1462,29 +1525,45 @@ called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
for debugging. The macro is referenced only in pcre_printint.c. */
for debugging, and some of them are only partial names. The macro is referenced
only in pcre_printint.c, which fills out the full names in many cases (and in
some cases doesn't actually use these names at all). */
#define OP_NAME_LIST \
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
"extuni", "\\Z", "\\z", \
"Opt", "^", "$", "char", "charnc", "not", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"^", "^", "$", "$", "char", "chari", "not", "noti", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", \
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
"AssertB", "AssertB not", "Reverse", \
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"class", "nclass", "xclass", "Ref", "Refi", \
"Recurse", "Callout", \
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
"Once", "Once_NC", \
"Bra", "BraPos", "CBra", "CBraPos", \
"Cond", \
"SBra", "SBraPos", "SCBra", "SCBraPos", \
"SCond", \
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
"Brazero", "Braminzero", \
"Brazero", "Braminzero", "Braposzero", \
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
"*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"*THEN", "*THEN", "*COMMIT", "*FAIL", \
"*ACCEPT", "*ASSERT_ACCEPT", \
"Close", "Skip zero"
@@ -1505,18 +1584,25 @@ in UTF-8 mode. The code that uses this table must know about such things. */
3, 3, /* \P, \p */ \
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1, /* \X */ \
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
1, 1, 1, 1, 1, 1, /* \Z, \z, ^, ^M, $, $M */ \
2, /* Char - the minimum length */ \
2, /* Charnc - the minimum length */ \
2, /* Chari - the minimum length */ \
2, /* not */ \
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
2, /* noti */ \
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I */ \
4, 4, 4, /* upto I, minupto I, exact I */ \
2, 2, 2, 4, /* *+I, ++I, ?+I, upto+I */ \
/* Negative single-char repeats - only for chars < 256 */ \
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
4, 4, 4, /* NOT upto, minupto, exact */ \
2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
2, 2, 2, 4, /* Possessive NOT *, +, ?, upto */ \
2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \
4, 4, 4, /* NOT upto I, minupto I, exact I */ \
2, 2, 2, 4, /* Possessive NOT *I, +I, ?I, upto I */ \
/* Positive type repeats */ \
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
4, 4, 4, /* Type upto, minupto, exact */ \
@@ -1528,33 +1614,40 @@ in UTF-8 mode. The code that uses this table must know about such things. */
33, /* NCLASS */ \
0, /* XCLASS - variable length */ \
3, /* REF */ \
3, /* REFI */ \
1+LINK_SIZE, /* RECURSE */ \
2+2*LINK_SIZE, /* CALLOUT */ \
1+LINK_SIZE, /* Alt */ \
1+LINK_SIZE, /* Ket */ \
1+LINK_SIZE, /* KetRmax */ \
1+LINK_SIZE, /* KetRmin */ \
1+LINK_SIZE, /* KetRpos */ \
1+LINK_SIZE, /* Reverse */ \
1+LINK_SIZE, /* Assert */ \
1+LINK_SIZE, /* Assert not */ \
1+LINK_SIZE, /* Assert behind */ \
1+LINK_SIZE, /* Assert behind not */ \
1+LINK_SIZE, /* Reverse */ \
1+LINK_SIZE, /* ONCE */ \
1+LINK_SIZE, /* ONCE_NC */ \
1+LINK_SIZE, /* BRA */ \
1+LINK_SIZE, /* BRAPOS */ \
3+LINK_SIZE, /* CBRA */ \
3+LINK_SIZE, /* CBRAPOS */ \
1+LINK_SIZE, /* COND */ \
1+LINK_SIZE, /* SBRA */ \
1+LINK_SIZE, /* SBRAPOS */ \
3+LINK_SIZE, /* SCBRA */ \
3+LINK_SIZE, /* SCBRAPOS */ \
1+LINK_SIZE, /* SCOND */ \
3, 3, /* CREF, NCREF */ \
3, 3, /* RREF, NRREF */ \
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1, 3, /* SKIP, SKIP_ARG */ \
1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
1, 3, /* THEN, THEN_ARG */ \
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \
3, 1 /* CLOSE, SKIPZERO */
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
condition. */
@@ -1571,8 +1664,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
ERRCOUNT };
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
ERR70, ERR71, ERR72, ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@@ -1648,10 +1741,12 @@ typedef struct compile_data {
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
int name_entry_size; /* Size of each entry */
int workspace_size; /* Size of workspace */
int bracount; /* Count of capturing parens as we compile */
int final_bracount; /* Saved value after first pass */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
int assert_depth; /* Depth of nested assertions */
int external_options; /* External (initial) options */
int external_flags; /* External flag bits to be set */
int req_varyopt; /* "After variable item" flag for reqbyte */
@@ -1663,7 +1758,7 @@ typedef struct compile_data {
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
branches, for testing for left recursion. */
branches, for testing for left recursion while compiling. */
typedef struct branch_chain {
struct branch_chain *outer;
@@ -1671,20 +1766,28 @@ typedef struct branch_chain {
} branch_chain;
/* Structure for items in a linked list that represents an explicit recursive
call within the pattern. */
call within the pattern; used by pcre_exec(). */
typedef struct recursion_info {
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
int group_num; /* Number of group that was called */
const uschar *after_call; /* "Return value": points after the call in the expr */
int *offset_save; /* Pointer to start of saved offsets */
int saved_max; /* Number of saved offsets */
int save_offset_top; /* Current value of offset_top */
int group_num; /* Number of group that was called */
int *offset_save; /* Pointer to start of saved offsets */
int saved_max; /* Number of saved offsets */
USPTR subject_position; /* Position at start of recursion */
} recursion_info;
/* A similar structure for pcre_dfa_exec(): one record for each regex recursion
(OP_RECURSE) that is currently in progress. The records are chained through
prevrec, with the head of the chain held in the DFA match data. The chain is
searched before a new recursion is started, so that calling the same group
again at the same subject position is reported as PCRE_ERROR_RECURSELOOP
instead of recursing for ever. */
typedef struct dfa_recursion_info {
struct dfa_recursion_info *prevrec; /* Previous recursion record (or NULL) */
int group_num; /* Number of group that was called; 0 = whole pattern */
USPTR subject_position; /* Subject pointer when the recursion started */
} dfa_recursion_info;
/* Structure for building a chain of data for holding the values of the subject
pointer at the start of each subpattern, so as to detect when an empty string
has been matched by a subpattern - to break infinite loops. */
has been matched by a subpattern - to break infinite loops; used by
pcre_exec(). */
typedef struct eptrblock {
struct eptrblock *epb_prev;
@@ -1708,8 +1811,8 @@ typedef struct match_data {
int name_entry_size; /* Size of entry in names table */
uschar *name_table; /* Table of names */
uschar nl[4]; /* Newline string when fixed */
const uschar *lcc; /* Points to lower casing table */
const uschar *ctypes; /* Points to table of type maps */
const uschar *lcc; /* Points to lower casing table */
const uschar *ctypes; /* Points to table of type maps */
BOOL offset_overflow; /* Set if too many extractions */
BOOL notbol; /* NOTBOL flag */
BOOL noteol; /* NOTEOL flag */
@@ -1721,7 +1824,9 @@ typedef struct match_data {
BOOL notempty_atstart; /* Empty string match at start not wanted */
BOOL hitend; /* Hit the end of the subject at some point */
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
const uschar *start_code; /* For use when recursing */
BOOL hasthen; /* Pattern contains (*THEN) */
BOOL ignore_skip_arg; /* For re-run when SKIP name not found */
const uschar *start_code; /* For use when recursing */
USPTR start_subject; /* Start of the subject string */
USPTR end_subject; /* End of the subject string */
USPTR start_match_ptr; /* Start of matched string */
@@ -1731,29 +1836,33 @@ typedef struct match_data {
int end_offset_top; /* Highwater mark at end of match */
int capture_last; /* Most recent capture number */
int start_offset; /* The start offset value */
int match_function_type; /* Set for certain special calls of MATCH() */
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
int eptrn; /* Next free eptrblock */
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
const uschar *mark; /* Mark pointer to pass back */
const uschar *mark; /* Mark pointer to pass back on success */
const uschar *nomatch_mark; /* Mark pointer to pass back on failure */
const uschar *once_target; /* Where to back up to for atomic groups */
} match_data;
/* A similar structure is used for the same purpose by the DFA matching
functions. */
typedef struct dfa_match_data {
const uschar *start_code; /* Start of the compiled pattern */
const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of subject string */
const uschar *start_used_ptr; /* Earliest consulted character */
const uschar *tables; /* Character tables */
int start_offset; /* The start offset value */
int moptions; /* Match options */
int poptions; /* Pattern options */
int nltype; /* Newline type */
int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed */
void *callout_data; /* To pass back to callouts */
const uschar *start_code; /* Start of the compiled pattern */
const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of subject string */
const uschar *start_used_ptr; /* Earliest consulted character */
const uschar *tables; /* Character tables */
int start_offset; /* The start offset value */
int moptions; /* Match options */
int poptions; /* Pattern options */
int nltype; /* Newline type */
int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed */
void *callout_data; /* To pass back to callouts */
dfa_recursion_info *recursive; /* Linked list of recursion data */
} dfa_match_data;
/* Bit definitions for entries in the pcre_ctypes table. */
@@ -1811,6 +1920,10 @@ extern const int _pcre_utf8_table2[];
extern const int _pcre_utf8_table3[];
extern const uschar _pcre_utf8_table4[];
#ifdef SUPPORT_JIT
extern const uschar _pcre_utf8_char_sizes[];
#endif
extern const int _pcre_utf8_table1_size;
extern const char _pcre_utt_names[];
@@ -1831,10 +1944,17 @@ extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
extern int _pcre_valid_utf8(USPTR, int);
extern int _pcre_valid_utf8(USPTR, int, int *);
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
extern BOOL _pcre_xclass(int, const uschar *);
#ifdef SUPPORT_JIT
extern void _pcre_jit_compile(const real_pcre *, pcre_extra *);
extern int _pcre_jit_exec(const real_pcre *, void *, PCRE_SPTR,
int, int, int, int, int *, int);
extern void _pcre_jit_free(void *);
extern int _pcre_jit_get_size(void *);
#endif
/* Unicode character database (UCD) */
@@ -1848,14 +1968,16 @@ extern const ucd_record _pcre_ucd_records[];
extern const uschar _pcre_ucd_stage1[];
extern const pcre_uint16 _pcre_ucd_stage2[];
extern const int _pcre_ucp_gentype[];
#ifdef SUPPORT_JIT
extern const int _pcre_ucp_typerange[];
#endif
/* UCD access macros */
#define UCD_BLOCK_SIZE 128
#define GET_UCD(ch) (_pcre_ucd_records + \
_pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE])
UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch) GET_UCD(ch)->script

File diff suppressed because it is too large Load Diff

View File

@@ -180,6 +180,7 @@ utf8 = (options & PCRE_UTF8) != 0;
for(;;)
{
uschar *ccode;
const char *flag = " ";
int c;
int extra = 0;
@@ -214,10 +215,6 @@ for(;;)
fprintf(f, "------------------------------------------------------------------\n");
return;
case OP_OPT:
fprintf(f, " %.2x %s", code[1], OP_names[*code]);
break;
case OP_CHAR:
fprintf(f, " ");
do
@@ -229,28 +226,33 @@ for(;;)
fprintf(f, "\n");
continue;
case OP_CHARNC:
fprintf(f, " NC ");
case OP_CHARI:
fprintf(f, " /i ");
do
{
code++;
code += 1 + print_char(f, code, utf8);
}
while (*code == OP_CHARNC);
while (*code == OP_CHARI);
fprintf(f, "\n");
continue;
case OP_CBRA:
case OP_CBRAPOS:
case OP_SCBRA:
case OP_SCBRAPOS:
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
else fprintf(f, " ");
fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
break;
case OP_BRA:
case OP_BRAPOS:
case OP_SBRA:
case OP_SBRAPOS:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
case OP_ALT:
case OP_KET:
case OP_ASSERT:
@@ -258,6 +260,7 @@ for(;;)
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
case OP_SCOND:
case OP_REVERSE:
@@ -295,6 +298,17 @@ for(;;)
fprintf(f, " Cond def");
break;
case OP_STARI:
case OP_MINSTARI:
case OP_POSSTARI:
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
flag = "/i";
/* Fall through */
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
@@ -313,7 +327,7 @@ for(;;)
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
fprintf(f, " ");
fprintf(f, " %s ", flag);
if (*code >= OP_TYPESTAR)
{
fprintf(f, "%s", OP_names[code[1]]);
@@ -327,17 +341,23 @@ for(;;)
fprintf(f, "%s", OP_names[*code]);
break;
case OP_EXACTI:
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
flag = "/i";
/* Fall through */
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
fprintf(f, " ");
fprintf(f, " %s ", flag);
extra = print_char(f, code+3, utf8);
fprintf(f, "{");
if (*code != OP_EXACT) fprintf(f, "0,");
if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_MINUPTO) fprintf(f, "?");
else if (*code == OP_POSUPTO) fprintf(f, "+");
if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
break;
case OP_TYPEEXACT:
@@ -357,12 +377,27 @@ for(;;)
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
break;
case OP_NOTI:
flag = "/i";
/* Fall through */
case OP_NOT:
c = code[1];
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
else fprintf(f, " [^\\x%02x]", c);
if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
else fprintf(f, " %s [^\\x%02x]", flag, c);
break;
case OP_NOTSTARI:
case OP_NOTMINSTARI:
case OP_NOTPOSSTARI:
case OP_NOTPLUSI:
case OP_NOTMINPLUSI:
case OP_NOTPOSPLUSI:
case OP_NOTQUERYI:
case OP_NOTMINQUERYI:
case OP_NOTPOSQUERYI:
flag = "/i";
/* Fall through */
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPOSSTAR:
@@ -373,22 +408,30 @@ for(;;)
case OP_NOTMINQUERY:
case OP_NOTPOSQUERY:
c = code[1];
if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
else fprintf(f, " [^\\x%02x]", c);
if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
else fprintf(f, " %s [^\\x%02x]", flag, c);
fprintf(f, "%s", OP_names[*code]);
break;
case OP_NOTEXACTI:
case OP_NOTUPTOI:
case OP_NOTMINUPTOI:
case OP_NOTPOSUPTOI:
flag = "/i";
/* Fall through */
case OP_NOTEXACT:
case OP_NOTUPTO:
case OP_NOTMINUPTO:
case OP_NOTPOSUPTO:
c = code[3];
if (PRINTABLE(c)) fprintf(f, " [^%c]{", c);
else fprintf(f, " [^\\x%02x]{", c);
if (*code != OP_NOTEXACT) fprintf(f, "0,");
if (PRINTABLE(c)) fprintf(f, " %s [^%c]{", flag, c);
else fprintf(f, " %s [^\\x%02x]{", flag, c);
if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_NOTMINUPTO) fprintf(f, "?");
else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
else
if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
break;
case OP_RECURSE:
@@ -397,8 +440,11 @@ for(;;)
fprintf(f, "%s", OP_names[*code]);
break;
case OP_REFI:
flag = "/i";
/* Fall through */
case OP_REF:
fprintf(f, " \\%d", GET2(code,1));
fprintf(f, " %s \\%d", flag, GET2(code,1));
ccode = code + _pcre_OP_lengths[*code];
goto CLASS_REF_REPEAT;
@@ -542,25 +588,23 @@ for(;;)
break;
case OP_THEN:
if (print_lengths)
fprintf(f, " %s %d", OP_names[*code], GET(code, 1));
else
fprintf(f, " %s", OP_names[*code]);
fprintf(f, " %s", OP_names[*code]);
break;
case OP_THEN_ARG:
if (print_lengths)
fprintf(f, " %s %d %s", OP_names[*code], GET(code, 1),
code + 2 + LINK_SIZE);
else
fprintf(f, " %s %s", OP_names[*code], code + 2 + LINK_SIZE);
extra += code[1+LINK_SIZE];
fprintf(f, " %s %s", OP_names[*code], code + 2);
extra += code[1];
break;
/* Anything else is just an item with no data*/
case OP_CIRCM:
case OP_DOLLM:
flag = "/m";
/* Fall through */
/* Anything else is just an item with no data, but possibly a flag. */
default:
fprintf(f, " %s", OP_names[*code]);
fprintf(f, " %s %s", flag, OP_names[*code]);
break;
}

View File

@@ -52,7 +52,7 @@ supporting functions. */
/* Returns from set_start_bits() */
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
@@ -66,17 +66,20 @@ string of that length that matches. In UTF8 mode, the result is in characters
rather than bytes.
Arguments:
code pointer to start of group (the bracket)
startcode pointer to start of the whole pattern
options the compiling options
code pointer to start of group (the bracket)
startcode pointer to start of the whole pattern
options the compiling options
recurse_depth current RECURSE nesting depth
Returns: the minimum length
-1 if \C was encountered
-1 if \C in UTF-8 mode or (*ACCEPT) was encountered
-2 internal error (missing capturing bracket)
-3 internal error (opcode not listed)
*/
static int
find_minlength(const uschar *code, const uschar *startcode, int options)
find_minlength(const uschar *code, const uschar *startcode, int options,
int recurse_depth)
{
int length = -1;
BOOL utf8 = (options & PCRE_UTF8) != 0;
@@ -84,7 +87,8 @@ BOOL had_recurse = FALSE;
register int branchlength = 0;
register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
if (*code == OP_CBRA || *code == OP_SCBRA ||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
@@ -118,26 +122,40 @@ for (;;)
case OP_SCBRA:
case OP_BRA:
case OP_SBRA:
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_BRAPOS:
case OP_SBRAPOS:
case OP_ONCE:
d = find_minlength(cc, startcode, options);
case OP_ONCE_NC:
d = find_minlength(cc, startcode, options, recurse_depth);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
/* ACCEPT makes things far too complicated; we have to give up. */
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:
return -1;
/* Reached end of a branch; if it's a ket it is the end of a nested
call. If it's ALT it is an alternation in a nested call. If it is
END it's the end of the outer call. All can be handled by the same code. */
call. If it's ALT it is an alternation in a nested call. If it is END it's
the end of the outer call. All can be handled by the same code. If an
ACCEPT was previously encountered, use the length that was in force at that
time, and pass back the shortest ACCEPT length. */
case OP_ALT:
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
case OP_END:
if (length < 0 || (!had_recurse && branchlength < length))
length = branchlength;
if (*cc != OP_ALT) return length;
if (op != OP_ALT) return length;
cc += 1 + LINK_SIZE;
branchlength = 0;
had_recurse = FALSE;
@@ -160,14 +178,15 @@ for (;;)
case OP_RREF:
case OP_NRREF:
case OP_DEF:
case OP_OPT:
case OP_CALLOUT:
case OP_SOD:
case OP_SOM:
case OP_EOD:
case OP_EODN:
case OP_CIRC:
case OP_CIRCM:
case OP_DOLL:
case OP_DOLLM:
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
cc += _pcre_OP_lengths[*cc];
@@ -177,6 +196,7 @@ for (;;)
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
case OP_SKIPZERO:
cc += _pcre_OP_lengths[*cc];
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -186,14 +206,21 @@ for (;;)
/* Handle literal characters and + repetitions */
case OP_CHAR:
case OP_CHARNC:
case OP_CHARI:
case OP_NOT:
case OP_NOTI:
case OP_PLUS:
case OP_PLUSI:
case OP_MINPLUS:
case OP_MINPLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
case OP_NOTPLUS:
case OP_NOTPLUSI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_NOTPOSPLUS:
case OP_NOTPOSPLUSI:
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF8
@@ -212,7 +239,9 @@ for (;;)
need to skip over a multibyte character in UTF8 mode. */
case OP_EXACT:
case OP_EXACTI:
case OP_NOTEXACT:
case OP_NOTEXACTI:
branchlength += GET2(cc,1);
cc += 4;
#ifdef SUPPORT_UTF8
@@ -249,14 +278,17 @@ for (;;)
cc++;
break;
/* "Any newline" might match two characters */
/* "Any newline" might match two characters, but it also might match just
one. */
case OP_ANYNL:
branchlength += 2;
branchlength += 1;
cc++;
break;
/* The single-byte matcher means we can't proceed in UTF-8 mode */
/* The single-byte matcher means we can't proceed in UTF-8 mode. (In
non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
appear, but leave the code, just in case.) */
case OP_ANYBYTE:
#ifdef SUPPORT_UTF8
@@ -337,6 +369,7 @@ for (;;)
that case we must set the minimum length to zero. */
case OP_REF:
case OP_REFI:
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
{
ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
@@ -347,7 +380,10 @@ for (;;)
d = 0;
had_recurse = TRUE;
}
else d = find_minlength(cs, startcode, options);
else
{
d = find_minlength(cs, startcode, options, recurse_depth);
}
}
else d = 0;
cc += 3;
@@ -364,6 +400,12 @@ for (;;)
cc++;
break;
case OP_CRPLUS:
case OP_CRMINPLUS:
min = 1;
cc++;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
min = GET2(cc, 1);
@@ -378,36 +420,68 @@ for (;;)
branchlength += min * d;
break;
/* We can easily detect direct recursion, but not mutual recursion. This is
caught by a recursion depth count. */
case OP_RECURSE:
cs = ce = (uschar *)startcode + GET(cc, 1);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce)
if ((cc > cs && cc < ce) || recurse_depth > 10)
had_recurse = TRUE;
else
branchlength += find_minlength(cs, startcode, options);
{
branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
}
cc += 1 + LINK_SIZE;
break;
/* Anything else does not or need not match a character. We can get the
item's length from the table, but for those that can match zero occurrences
of a character, we must take special action for UTF-8 characters. */
of a character, we must take special action for UTF-8 characters. As it
happens, the "NOT" versions of these opcodes are used at present only for
ASCII characters, so they could be omitted from this list. However, in
future that may change, so we include them here so as not to leave a
gotcha for a future maintainer. */
case OP_UPTO:
case OP_UPTOI:
case OP_NOTUPTO:
case OP_NOTUPTOI:
case OP_MINUPTO:
case OP_MINUPTOI:
case OP_NOTMINUPTO:
case OP_NOTMINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
case OP_NOTPOSUPTO:
case OP_NOTPOSUPTOI:
case OP_STAR:
case OP_STARI:
case OP_NOTSTAR:
case OP_NOTSTARI:
case OP_MINSTAR:
case OP_MINSTARI:
case OP_NOTMINSTAR:
case OP_NOTMINSTARI:
case OP_POSSTAR:
case OP_POSSTARI:
case OP_NOTPOSSTAR:
case OP_NOTPOSSTARI:
case OP_QUERY:
case OP_QUERYI:
case OP_NOTQUERY:
case OP_NOTQUERYI:
case OP_MINQUERY:
case OP_MINQUERYI:
case OP_NOTMINQUERY:
case OP_NOTMINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
cc += _pcre_OP_lengths[op];
#ifdef SUPPORT_UTF8
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
@@ -419,20 +493,27 @@ for (;;)
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
cc += _pcre_OP_lengths[op] + cc[1];
break;
case OP_THEN_ARG:
cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
break;
/* The remaining opcodes are just skipped over. */
/* For the record, these are the opcodes that are matched by "default":
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
OP_THEN. */
default:
case OP_CLOSE:
case OP_COMMIT:
case OP_FAIL:
case OP_PRUNE:
case OP_SET_SOM:
case OP_SKIP:
case OP_THEN:
cc += _pcre_OP_lengths[op];
break;
/* This should not occur: we list all opcodes explicitly so that when
new ones get added they are properly considered. */
default:
return -3;
}
}
/* Control never gets here */
@@ -578,18 +659,18 @@ function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
start_bits points to a 32-byte table, initialized to 0
caseless the current state of the caseless flag
utf8 TRUE if in UTF-8 mode
cd the block with char table pointers
Returns: SSB_FAIL => Failed to find any starting bytes
SSB_DONE => Found mandatory starting bytes
SSB_CONTINUE => Found optional starting bytes
SSB_UNKNOWN => Hit an unrecognized opcode
*/
static int
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
BOOL utf8, compile_data *cd)
set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
compile_data *cd)
{
register int c;
int yield = SSB_DONE;
@@ -614,19 +695,106 @@ volatile int dummy;
do
{
const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
BOOL try_next = TRUE;
const uschar *tcode = code + 1 + LINK_SIZE;
if (*code == OP_CBRA || *code == OP_SCBRA ||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
while (try_next) /* Loop for items in this branch */
{
int rc;
switch(*tcode)
{
/* Fail if we reach something we don't understand */
/* If we reach something we don't understand, it means a new opcode has
been created that hasn't been added to this code. Hopefully this problem
will be discovered during testing. */
default:
return SSB_UNKNOWN;
/* Fail for a valid opcode that implies no starting bits. */
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:
case OP_ALLANY:
case OP_ANY:
case OP_ANYBYTE:
case OP_CIRC:
case OP_CIRCM:
case OP_CLOSE:
case OP_COMMIT:
case OP_COND:
case OP_CREF:
case OP_DEF:
case OP_DOLL:
case OP_DOLLM:
case OP_END:
case OP_EOD:
case OP_EODN:
case OP_EXTUNI:
case OP_FAIL:
case OP_MARK:
case OP_NCREF:
case OP_NOT:
case OP_NOTEXACT:
case OP_NOTEXACTI:
case OP_NOTI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_NOTMINQUERY:
case OP_NOTMINQUERYI:
case OP_NOTMINSTAR:
case OP_NOTMINSTARI:
case OP_NOTMINUPTO:
case OP_NOTMINUPTOI:
case OP_NOTPLUS:
case OP_NOTPLUSI:
case OP_NOTPOSPLUS:
case OP_NOTPOSPLUSI:
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
case OP_NOTPOSSTAR:
case OP_NOTPOSSTARI:
case OP_NOTPOSUPTO:
case OP_NOTPOSUPTOI:
case OP_NOTPROP:
case OP_NOTQUERY:
case OP_NOTQUERYI:
case OP_NOTSTAR:
case OP_NOTSTARI:
case OP_NOTUPTO:
case OP_NOTUPTOI:
case OP_NOT_HSPACE:
case OP_NOT_VSPACE:
case OP_NRREF:
case OP_PROP:
case OP_PRUNE:
case OP_PRUNE_ARG:
case OP_RECURSE:
case OP_REF:
case OP_REFI:
case OP_REVERSE:
case OP_RREF:
case OP_SCOND:
case OP_SET_SOM:
case OP_SKIP:
case OP_SKIP_ARG:
case OP_SOD:
case OP_SOM:
case OP_THEN:
case OP_THEN_ARG:
case OP_XCLASS:
return SSB_FAIL;
/* We can ignore word boundary tests. */
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY:
tcode++;
break;
/* If we hit a bracket or a positive lookahead assertion, recurse to set
bits from within the subpattern. If it can't find anything, we have to
give up. If it finds some mandatory character(s), we are done for this
@@ -636,10 +804,15 @@ do
case OP_SBRA:
case OP_CBRA:
case OP_SCBRA:
case OP_BRAPOS:
case OP_SBRAPOS:
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_ONCE:
case OP_ONCE_NC:
case OP_ASSERT:
rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
if (rc == SSB_FAIL) return SSB_FAIL;
rc = set_start_bits(tcode, start_bits, utf8, cd);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
if (rc == SSB_DONE) try_next = FALSE; else
{
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
@@ -662,6 +835,7 @@ do
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
return SSB_CONTINUE;
/* Skip over callout */
@@ -679,19 +853,13 @@ do
tcode += 1 + LINK_SIZE;
break;
/* Skip over an option setting, changing the caseless flag */
case OP_OPT:
caseless = (tcode[1] & PCRE_CASELESS) != 0;
tcode += 2;
break;
/* BRAZERO does the bracket, but carries on. */
case OP_BRAZERO:
case OP_BRAMINZERO:
if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
return SSB_FAIL;
case OP_BRAPOSZERO:
rc = set_start_bits(++tcode, start_bits, utf8, cd);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
/* =========================================================================
See the comment at the head of this function concerning the next line,
which was an old fudge for the benefit of OS/2.
@@ -717,7 +885,16 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
break;
case OP_STARI:
case OP_MINSTARI:
case OP_POSSTARI:
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
break;
/* Single-char upto sets the bit and tries the next */
@@ -725,20 +902,36 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
break;
/* At least one single char sets the bit and stops */
case OP_EXACT: /* Fall through */
case OP_EXACT:
tcode += 2;
/* Fall through */
case OP_CHAR:
case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
(void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
(void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
try_next = FALSE;
break;
case OP_EXACTI:
tcode += 2;
/* Fall through */
case OP_CHARI:
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
(void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
try_next = FALSE;
break;
@@ -968,7 +1161,8 @@ do
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
}
/* Advance past the bit map, and act on what follows */
/* Advance past the bit map, and act on what follows. For a zero
minimum repeat, continue; otherwise stop processing. */
tcode += 32;
switch (*tcode)
@@ -1004,6 +1198,8 @@ return yield;
/*************************************************
* Study a compiled expression *
*************************************************/
@@ -1029,7 +1225,7 @@ pcre_study(const pcre *external_re, int options, const char **errorptr)
int min;
BOOL bits_set = FALSE;
uschar start_bits[32];
pcre_extra *extra;
pcre_extra *extra = NULL;
pcre_study_data *study;
const uschar *tables;
uschar *code;
@@ -1060,6 +1256,8 @@ seeking a list of starting bytes. */
if ((re->options & PCRE_ANCHORED) == 0 &&
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
{
int rc;
/* Set the character tables in the block that is passed around */
tables = re->tables;
@@ -1075,55 +1273,116 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* See if we can find a fixed set of initial characters for the pattern. */
memset(start_bits, 0, 32 * sizeof(uschar));
bits_set = set_start_bits(code, start_bits,
(re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
&compile_block) == SSB_DONE;
rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
&compile_block);
bits_set = rc == SSB_DONE;
if (rc == SSB_UNKNOWN)
{
*errorptr = "internal error: opcode not recognized";
return NULL;
}
}
/* Find the minimum length of subject string. */
min = find_minlength(code, code, re->options);
/* Return NULL if no optimization is possible. */
if (!bits_set && min < 0) return NULL;
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
the latter, which is pointed to by the former, which may also get additional
data set later by the calling program. At the moment, the size of
pcre_study_data is fixed. We nevertheless save it in a field for returning via
the pcre_fullinfo() function so that if it becomes variable in the future, we
don't have to change that code. */
extra = (pcre_extra *)(pcre_malloc)
(sizeof(pcre_extra) + sizeof(pcre_study_data));
if (extra == NULL)
switch(min = find_minlength(code, code, re->options, 0))
{
*errorptr = "failed to get memory";
return NULL;
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
default: break;
}
study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
extra->flags = PCRE_EXTRA_STUDY_DATA;
extra->study_data = study;
/* If a set of starting bytes has been identified, or if the minimum length is
greater than zero, or if JIT optimization has been requested, get a pcre_extra
block and a pcre_study_data block. The study data is put in the latter, which
is pointed to by the former, which may also get additional data set later by
the calling program. At the moment, the size of pcre_study_data is fixed. We
nevertheless save it in a field for returning via the pcre_fullinfo() function
so that if it becomes variable in the future, we don't have to change that
code. */
study->size = sizeof(pcre_study_data);
study->flags = 0;
if (bits_set)
if (bits_set || min > 0
#ifdef SUPPORT_JIT
|| (options & PCRE_STUDY_JIT_COMPILE) != 0
#endif
)
{
study->flags |= PCRE_STUDY_MAPPED;
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
extra = (pcre_extra *)(pcre_malloc)
(sizeof(pcre_extra) + sizeof(pcre_study_data));
if (extra == NULL)
{
*errorptr = "failed to get memory";
return NULL;
}
if (min >= 0)
{
study->flags |= PCRE_STUDY_MINLEN;
study->minlength = min;
study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
extra->flags = PCRE_EXTRA_STUDY_DATA;
extra->study_data = study;
study->size = sizeof(pcre_study_data);
study->flags = 0;
/* Set the start bits always, to avoid unset memory errors if the
study data is written to a file, but set the flag only if any of the bits
are set, to save time looking when none are. */
if (bits_set)
{
study->flags |= PCRE_STUDY_MAPPED;
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
else memset(study->start_bits, 0, 32 * sizeof(uschar));
/* Always set the minlength value in the block, because the JIT compiler
makes use of it. However, don't set the bit unless the length is greater than
zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
checking the zero case. */
if (min > 0)
{
study->flags |= PCRE_STUDY_MINLEN;
study->minlength = min;
}
else study->minlength = 0;
/* If JIT support was compiled and requested, attempt the JIT compilation.
If no starting bytes were found, and the minimum length is zero, and JIT
compilation fails, abandon the extra block and return NULL. */
#ifdef SUPPORT_JIT
extra->executable_jit = NULL;
if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
{
pcre_free_study(extra);
extra = NULL;
}
#endif
}
return extra;
}
/*************************************************
* Free the study data *
*************************************************/
/* This function frees the memory that was obtained by pcre_study(). When JIT
support is compiled in and the block carries JIT-compiled code, that code is
released first.

Argument:   a pointer to the pcre_extra block; NULL is accepted and ignored
Returns:    nothing
*/
PCRE_EXP_DEFN void
pcre_free_study(pcre_extra *extra)
{
/* pcre_study() returns NULL when it finds nothing worth recording, so a caller
may hand that NULL straight back. Do not dereference it, and do not pass it on
to pcre_free(). */
if (extra == NULL) return;
#ifdef SUPPORT_JIT
if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
     extra->executable_jit != NULL)
  _pcre_jit_free(extra->executable_jit);
#endif
pcre_free(extra);
}
/* End of pcre_study.c */

View File

@@ -87,6 +87,19 @@ const uschar _pcre_utf8_table4[] = {
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
#ifdef SUPPORT_JIT
/* Full table of the number of extra bytes when the
character code is greater than or equal to 0xc0.
See _pcre_utf8_table4 above. */
const uschar _pcre_utf8_char_sizes[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,
};
#endif
/* Table to translate from particular type value to the general value. */
const int _pcre_ucp_gentype[] = {
@@ -100,6 +113,21 @@ const int _pcre_ucp_gentype[] = {
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
};
#ifdef SUPPORT_JIT
/* This table reverses _pcre_ucp_gentype. We can save the cost
of a memory load.

The entries come in pairs, one pair per line. Each pair appears to hold the
first and last particular type of one general category (for example
ucp_Zl .. ucp_Zs for Z), so that a general-category test can be done as a
range comparison on the particular type -- NOTE(review): confirm the exact
indexing against the JIT compiler, which is presumably the only user since
the table is built only under SUPPORT_JIT. */
const int _pcre_ucp_typerange[] = {
ucp_Cc, ucp_Cs,
ucp_Ll, ucp_Lu,
ucp_Mc, ucp_Mn,
ucp_Nd, ucp_No,
ucp_Pc, ucp_Ps,
ucp_Sc, ucp_So,
ucp_Zl, ucp_Zs,
};
#endif
/* The pcre_utt[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
@@ -110,7 +138,7 @@ table itself. Maintenance is more error-prone, but frequent changes to this
data are unlikely.
July 2008: There is now a script called maint/GenerateUtt.py that can be used
to generate this data instead of maintaining it entirely by hand.
to generate this data automatically instead of maintaining it by hand.
The script was updated in March 2009 to generate a new EBCDIC-compliant
version. Like all other character and string literals that are compared against
@@ -123,8 +151,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
@@ -186,6 +216,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
#define STRING_Mc0 STR_M STR_c "\0"
#define STRING_Me0 STR_M STR_e "\0"
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
@@ -260,8 +291,10 @@ const char _pcre_utt_names[] =
STRING_Avestan0
STRING_Balinese0
STRING_Bamum0
STRING_Batak0
STRING_Bengali0
STRING_Bopomofo0
STRING_Brahmi0
STRING_Braille0
STRING_Buginese0
STRING_Buhid0
@@ -323,6 +356,7 @@ const char _pcre_utt_names[] =
STRING_Lydian0
STRING_M0
STRING_Malayalam0
STRING_Mandaic0
STRING_Mc0
STRING_Me0
STRING_Meetei_Mayek0
@@ -397,135 +431,138 @@ const ucp_type_table _pcre_utt[] = {
{ 20, PT_SC, ucp_Avestan },
{ 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bamum },
{ 43, PT_SC, ucp_Bengali },
{ 51, PT_SC, ucp_Bopomofo },
{ 60, PT_SC, ucp_Braille },
{ 68, PT_SC, ucp_Buginese },
{ 77, PT_SC, ucp_Buhid },
{ 83, PT_GC, ucp_C },
{ 85, PT_SC, ucp_Canadian_Aboriginal },
{ 105, PT_SC, ucp_Carian },
{ 112, PT_PC, ucp_Cc },
{ 115, PT_PC, ucp_Cf },
{ 118, PT_SC, ucp_Cham },
{ 123, PT_SC, ucp_Cherokee },
{ 132, PT_PC, ucp_Cn },
{ 135, PT_PC, ucp_Co },
{ 138, PT_SC, ucp_Common },
{ 145, PT_SC, ucp_Coptic },
{ 152, PT_PC, ucp_Cs },
{ 155, PT_SC, ucp_Cuneiform },
{ 165, PT_SC, ucp_Cypriot },
{ 173, PT_SC, ucp_Cyrillic },
{ 182, PT_SC, ucp_Deseret },
{ 190, PT_SC, ucp_Devanagari },
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 222, PT_SC, ucp_Ethiopic },
{ 231, PT_SC, ucp_Georgian },
{ 240, PT_SC, ucp_Glagolitic },
{ 251, PT_SC, ucp_Gothic },
{ 258, PT_SC, ucp_Greek },
{ 264, PT_SC, ucp_Gujarati },
{ 273, PT_SC, ucp_Gurmukhi },
{ 282, PT_SC, ucp_Han },
{ 286, PT_SC, ucp_Hangul },
{ 293, PT_SC, ucp_Hanunoo },
{ 301, PT_SC, ucp_Hebrew },
{ 308, PT_SC, ucp_Hiragana },
{ 317, PT_SC, ucp_Imperial_Aramaic },
{ 334, PT_SC, ucp_Inherited },
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
{ 366, PT_SC, ucp_Inscriptional_Parthian },
{ 389, PT_SC, ucp_Javanese },
{ 398, PT_SC, ucp_Kaithi },
{ 405, PT_SC, ucp_Kannada },
{ 413, PT_SC, ucp_Katakana },
{ 422, PT_SC, ucp_Kayah_Li },
{ 431, PT_SC, ucp_Kharoshthi },
{ 442, PT_SC, ucp_Khmer },
{ 448, PT_GC, ucp_L },
{ 450, PT_LAMP, 0 },
{ 453, PT_SC, ucp_Lao },
{ 457, PT_SC, ucp_Latin },
{ 463, PT_SC, ucp_Lepcha },
{ 470, PT_SC, ucp_Limbu },
{ 476, PT_SC, ucp_Linear_B },
{ 485, PT_SC, ucp_Lisu },
{ 490, PT_PC, ucp_Ll },
{ 493, PT_PC, ucp_Lm },
{ 496, PT_PC, ucp_Lo },
{ 499, PT_PC, ucp_Lt },
{ 502, PT_PC, ucp_Lu },
{ 505, PT_SC, ucp_Lycian },
{ 512, PT_SC, ucp_Lydian },
{ 519, PT_GC, ucp_M },
{ 521, PT_SC, ucp_Malayalam },
{ 531, PT_PC, ucp_Mc },
{ 534, PT_PC, ucp_Me },
{ 537, PT_SC, ucp_Meetei_Mayek },
{ 550, PT_PC, ucp_Mn },
{ 553, PT_SC, ucp_Mongolian },
{ 563, PT_SC, ucp_Myanmar },
{ 571, PT_GC, ucp_N },
{ 573, PT_PC, ucp_Nd },
{ 576, PT_SC, ucp_New_Tai_Lue },
{ 588, PT_SC, ucp_Nko },
{ 592, PT_PC, ucp_Nl },
{ 595, PT_PC, ucp_No },
{ 598, PT_SC, ucp_Ogham },
{ 604, PT_SC, ucp_Ol_Chiki },
{ 613, PT_SC, ucp_Old_Italic },
{ 624, PT_SC, ucp_Old_Persian },
{ 636, PT_SC, ucp_Old_South_Arabian },
{ 654, PT_SC, ucp_Old_Turkic },
{ 665, PT_SC, ucp_Oriya },
{ 671, PT_SC, ucp_Osmanya },
{ 679, PT_GC, ucp_P },
{ 681, PT_PC, ucp_Pc },
{ 684, PT_PC, ucp_Pd },
{ 687, PT_PC, ucp_Pe },
{ 690, PT_PC, ucp_Pf },
{ 693, PT_SC, ucp_Phags_Pa },
{ 702, PT_SC, ucp_Phoenician },
{ 713, PT_PC, ucp_Pi },
{ 716, PT_PC, ucp_Po },
{ 719, PT_PC, ucp_Ps },
{ 722, PT_SC, ucp_Rejang },
{ 729, PT_SC, ucp_Runic },
{ 735, PT_GC, ucp_S },
{ 737, PT_SC, ucp_Samaritan },
{ 747, PT_SC, ucp_Saurashtra },
{ 758, PT_PC, ucp_Sc },
{ 761, PT_SC, ucp_Shavian },
{ 769, PT_SC, ucp_Sinhala },
{ 777, PT_PC, ucp_Sk },
{ 780, PT_PC, ucp_Sm },
{ 783, PT_PC, ucp_So },
{ 786, PT_SC, ucp_Sundanese },
{ 796, PT_SC, ucp_Syloti_Nagri },
{ 809, PT_SC, ucp_Syriac },
{ 816, PT_SC, ucp_Tagalog },
{ 824, PT_SC, ucp_Tagbanwa },
{ 833, PT_SC, ucp_Tai_Le },
{ 840, PT_SC, ucp_Tai_Tham },
{ 849, PT_SC, ucp_Tai_Viet },
{ 858, PT_SC, ucp_Tamil },
{ 864, PT_SC, ucp_Telugu },
{ 871, PT_SC, ucp_Thaana },
{ 878, PT_SC, ucp_Thai },
{ 883, PT_SC, ucp_Tibetan },
{ 891, PT_SC, ucp_Tifinagh },
{ 900, PT_SC, ucp_Ugaritic },
{ 909, PT_SC, ucp_Vai },
{ 913, PT_ALNUM, 0 },
{ 917, PT_PXSPACE, 0 },
{ 921, PT_SPACE, 0 },
{ 925, PT_WORD, 0 },
{ 929, PT_SC, ucp_Yi },
{ 932, PT_GC, ucp_Z },
{ 934, PT_PC, ucp_Zl },
{ 937, PT_PC, ucp_Zp },
{ 940, PT_PC, ucp_Zs }
{ 43, PT_SC, ucp_Batak },
{ 49, PT_SC, ucp_Bengali },
{ 57, PT_SC, ucp_Bopomofo },
{ 66, PT_SC, ucp_Brahmi },
{ 73, PT_SC, ucp_Braille },
{ 81, PT_SC, ucp_Buginese },
{ 90, PT_SC, ucp_Buhid },
{ 96, PT_GC, ucp_C },
{ 98, PT_SC, ucp_Canadian_Aboriginal },
{ 118, PT_SC, ucp_Carian },
{ 125, PT_PC, ucp_Cc },
{ 128, PT_PC, ucp_Cf },
{ 131, PT_SC, ucp_Cham },
{ 136, PT_SC, ucp_Cherokee },
{ 145, PT_PC, ucp_Cn },
{ 148, PT_PC, ucp_Co },
{ 151, PT_SC, ucp_Common },
{ 158, PT_SC, ucp_Coptic },
{ 165, PT_PC, ucp_Cs },
{ 168, PT_SC, ucp_Cuneiform },
{ 178, PT_SC, ucp_Cypriot },
{ 186, PT_SC, ucp_Cyrillic },
{ 195, PT_SC, ucp_Deseret },
{ 203, PT_SC, ucp_Devanagari },
{ 214, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 235, PT_SC, ucp_Ethiopic },
{ 244, PT_SC, ucp_Georgian },
{ 253, PT_SC, ucp_Glagolitic },
{ 264, PT_SC, ucp_Gothic },
{ 271, PT_SC, ucp_Greek },
{ 277, PT_SC, ucp_Gujarati },
{ 286, PT_SC, ucp_Gurmukhi },
{ 295, PT_SC, ucp_Han },
{ 299, PT_SC, ucp_Hangul },
{ 306, PT_SC, ucp_Hanunoo },
{ 314, PT_SC, ucp_Hebrew },
{ 321, PT_SC, ucp_Hiragana },
{ 330, PT_SC, ucp_Imperial_Aramaic },
{ 347, PT_SC, ucp_Inherited },
{ 357, PT_SC, ucp_Inscriptional_Pahlavi },
{ 379, PT_SC, ucp_Inscriptional_Parthian },
{ 402, PT_SC, ucp_Javanese },
{ 411, PT_SC, ucp_Kaithi },
{ 418, PT_SC, ucp_Kannada },
{ 426, PT_SC, ucp_Katakana },
{ 435, PT_SC, ucp_Kayah_Li },
{ 444, PT_SC, ucp_Kharoshthi },
{ 455, PT_SC, ucp_Khmer },
{ 461, PT_GC, ucp_L },
{ 463, PT_LAMP, 0 },
{ 466, PT_SC, ucp_Lao },
{ 470, PT_SC, ucp_Latin },
{ 476, PT_SC, ucp_Lepcha },
{ 483, PT_SC, ucp_Limbu },
{ 489, PT_SC, ucp_Linear_B },
{ 498, PT_SC, ucp_Lisu },
{ 503, PT_PC, ucp_Ll },
{ 506, PT_PC, ucp_Lm },
{ 509, PT_PC, ucp_Lo },
{ 512, PT_PC, ucp_Lt },
{ 515, PT_PC, ucp_Lu },
{ 518, PT_SC, ucp_Lycian },
{ 525, PT_SC, ucp_Lydian },
{ 532, PT_GC, ucp_M },
{ 534, PT_SC, ucp_Malayalam },
{ 544, PT_SC, ucp_Mandaic },
{ 552, PT_PC, ucp_Mc },
{ 555, PT_PC, ucp_Me },
{ 558, PT_SC, ucp_Meetei_Mayek },
{ 571, PT_PC, ucp_Mn },
{ 574, PT_SC, ucp_Mongolian },
{ 584, PT_SC, ucp_Myanmar },
{ 592, PT_GC, ucp_N },
{ 594, PT_PC, ucp_Nd },
{ 597, PT_SC, ucp_New_Tai_Lue },
{ 609, PT_SC, ucp_Nko },
{ 613, PT_PC, ucp_Nl },
{ 616, PT_PC, ucp_No },
{ 619, PT_SC, ucp_Ogham },
{ 625, PT_SC, ucp_Ol_Chiki },
{ 634, PT_SC, ucp_Old_Italic },
{ 645, PT_SC, ucp_Old_Persian },
{ 657, PT_SC, ucp_Old_South_Arabian },
{ 675, PT_SC, ucp_Old_Turkic },
{ 686, PT_SC, ucp_Oriya },
{ 692, PT_SC, ucp_Osmanya },
{ 700, PT_GC, ucp_P },
{ 702, PT_PC, ucp_Pc },
{ 705, PT_PC, ucp_Pd },
{ 708, PT_PC, ucp_Pe },
{ 711, PT_PC, ucp_Pf },
{ 714, PT_SC, ucp_Phags_Pa },
{ 723, PT_SC, ucp_Phoenician },
{ 734, PT_PC, ucp_Pi },
{ 737, PT_PC, ucp_Po },
{ 740, PT_PC, ucp_Ps },
{ 743, PT_SC, ucp_Rejang },
{ 750, PT_SC, ucp_Runic },
{ 756, PT_GC, ucp_S },
{ 758, PT_SC, ucp_Samaritan },
{ 768, PT_SC, ucp_Saurashtra },
{ 779, PT_PC, ucp_Sc },
{ 782, PT_SC, ucp_Shavian },
{ 790, PT_SC, ucp_Sinhala },
{ 798, PT_PC, ucp_Sk },
{ 801, PT_PC, ucp_Sm },
{ 804, PT_PC, ucp_So },
{ 807, PT_SC, ucp_Sundanese },
{ 817, PT_SC, ucp_Syloti_Nagri },
{ 830, PT_SC, ucp_Syriac },
{ 837, PT_SC, ucp_Tagalog },
{ 845, PT_SC, ucp_Tagbanwa },
{ 854, PT_SC, ucp_Tai_Le },
{ 861, PT_SC, ucp_Tai_Tham },
{ 870, PT_SC, ucp_Tai_Viet },
{ 879, PT_SC, ucp_Tamil },
{ 885, PT_SC, ucp_Telugu },
{ 892, PT_SC, ucp_Thaana },
{ 899, PT_SC, ucp_Thai },
{ 904, PT_SC, ucp_Tibetan },
{ 912, PT_SC, ucp_Tifinagh },
{ 921, PT_SC, ucp_Ugaritic },
{ 930, PT_SC, ucp_Vai },
{ 934, PT_ALNUM, 0 },
{ 938, PT_PXSPACE, 0 },
{ 942, PT_SPACE, 0 },
{ 946, PT_WORD, 0 },
{ 950, PT_SC, ucp_Yi },
{ 953, PT_GC, ucp_Z },
{ 955, PT_PC, ucp_Zl },
{ 958, PT_PC, ucp_Zp },
{ 961, PT_PC, ucp_Zs }
};
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);

File diff suppressed because it is too large Load Diff

View File

@@ -54,42 +54,56 @@ strings. */
*************************************************/
/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
check that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.
can be turned off for maximum performance, but the consequences of supplying an
invalid string are then undefined.
Originally, this function checked according to RFC 2279, allowing for values in
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
the canonical format. Once somebody had pointed out RFC 3629 to me (it
obsoletes 2279), additional restrictions were applied. The values are now
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
subrange 0xd000 to 0xdfff is excluded.
subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
characters is still checked.
From release 8.13 more information about the details of the error are passed
back in the returned value:
PCRE_UTF8_ERR0 No error
PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string
PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string
PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string
PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string
PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80
PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80
PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80
PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80
PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80
PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629
PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629
PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
PCRE_UTF8_ERR15 Overlong 2-byte sequence
PCRE_UTF8_ERR16 Overlong 3-byte sequence
PCRE_UTF8_ERR17 Overlong 4-byte sequence
PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
Arguments:
string points to the string
length length of string, or -1 if the string is zero-terminated
erroroffset pointer to an error position offset variable
Returns: < 0 if the string is a valid UTF-8 string
>= 0 otherwise; the value is the offset of the bad byte
Bad bytes can be:
. An isolated byte whose most significant bits are 0x80, because this
can only correctly appear within a UTF-8 character;
. A byte whose most significant bits are 0xc0, but whose other bits indicate
that there are more than 3 additional bytes (i.e. an RFC 2279 starting
byte, which is no longer valid under RFC 3629);
.
The returned offset may also be equal to the length of the string; this means
that one or more bytes is missing from the final UTF-8 character.
Returns: = 0 if the string is a valid UTF-8 string
> 0 otherwise, setting the offset of the bad character
*/
int
_pcre_valid_utf8(USPTR string, int length)
_pcre_valid_utf8(USPTR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF8
register USPTR p;
@@ -97,84 +111,189 @@ register USPTR p;
if (length < 0)
{
for (p = string; *p != 0; p++);
length = p - string;
length = (int)(p - string);
}
for (p = string; length-- > 0; p++)
{
register int ab;
register int c = *p;
if (c < 128) continue;
if (c < 0xc0) return p - string;
register int ab, c, d;
c = *p;
if (c < 128) continue; /* ASCII character */
if (c < 0xc0) /* Isolated 10xx xxxx byte */
{
*erroroffset = (int)(p - string);
return PCRE_UTF8_ERR20;
}
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
{
*erroroffset = (int)(p - string);
return PCRE_UTF8_ERR21;
}
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
if (ab > 3) return p - string; /* Too many for RFC 3629 */
if (length < ab) return p + 1 + length - string; /* Missing bytes */
length -= ab;
if (length < ab)
{
*erroroffset = (int)(p - string); /* Missing bytes */
return ab - length; /* Codes ERR1 to ERR5 */
}
length -= ab; /* Length remaining */
/* Check top bits in the second byte */
if ((*(++p) & 0xc0) != 0x80) return p - string;
/* Check for overlong sequences for each different length, and for the
excluded range 0xd000 to 0xdfff. */
if (((d = *(++p)) & 0xc0) != 0x80)
{
*erroroffset = (int)(p - string) - 1;
return PCRE_UTF8_ERR6;
}
/* For each length, check that the remaining bytes start with the 0x80 bit
set and not the 0x40 bit. Then check for an overlong sequence, and for the
excluded range 0xd800 to 0xdfff. */
switch (ab)
{
/* Check for xx00 000x (overlong sequence) */
/* 2-byte character. No further bytes to check for 0x80. Check first byte
for xx00 000x (overlong sequence). */
case 1:
if ((c & 0x3e) == 0) return p - string;
continue; /* We know there aren't any more bytes to check */
case 1: if ((c & 0x3e) == 0)
{
*erroroffset = (int)(p - string) - 1;
return PCRE_UTF8_ERR15;
}
break;
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or
1110 1101, 1010 xxxx (0xd000 - 0xdfff) */
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
for 1110 0000, xx0x xxxx (overlong sequence) or
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
case 2:
if ((c == 0xe0 && (*p & 0x20) == 0) ||
(c == 0xed && *p >= 0xa0))
return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if (c == 0xe0 && (d & 0x20) == 0)
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR16;
}
if (c == 0xed && d >= 0xa0)
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR14;
}
break;
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or
greater than 0x0010ffff (f4 8f bf bf) */
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
character greater than 0x0010ffff (f4 8f bf bf) */
case 3:
if ((c == 0xf0 && (*p & 0x30) == 0) ||
(c > 0xf4 ) ||
(c == 0xf4 && *p > 0x8f))
return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR8;
}
if (c == 0xf0 && (d & 0x30) == 0)
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR17;
}
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR13;
}
break;
#if 0
/* These cases can no longer occur, as we restrict to a maximum of four
bytes nowadays. Leave the code here in case we ever want to add an option
for longer sequences. */
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
rejected by the length test below. However, we do the appropriate tests
here so that overlong sequences get diagnosed, and also in case there is
ever an option for handling these larger code points. */
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
1111 1000, xx00 0xxx */
/* Check for 1111 1000, xx00 0xxx */
case 4:
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
return PCRE_UTF8_ERR9;
}
if (c == 0xf8 && (d & 0x38) == 0)
{
*erroroffset = (int)(p - string) - 4;
return PCRE_UTF8_ERR18;
}
break;
/* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
1111 1100, xx00 00xx. */
case 5:
if (c == 0xfe || c == 0xff ||
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
return PCRE_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
return PCRE_UTF8_ERR9;
}
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
{
*erroroffset = (int)(p - string) - 5;
return PCRE_UTF8_ERR10;
}
if (c == 0xfc && (d & 0x3c) == 0)
{
*erroroffset = (int)(p - string) - 5;
return PCRE_UTF8_ERR19;
}
break;
#endif
}
/* Check for valid bytes after the 2nd, if any; all must start 10 */
while (--ab > 0)
/* Character is valid under RFC 2279, but 5-byte and 6-byte characters are
excluded by RFC 3629. The pointer p is currently at the last byte of the
character. */
if (ab > 3)
{
if ((*(++p) & 0xc0) != 0x80) return p - string;
*erroroffset = (int)(p - string) - ab;
return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
}
}
#else
#else /* SUPPORT_UTF8 */
(void)(string); /* Keep picky compilers happy */
(void)(length);
#endif
return -1;
return PCRE_UTF8_ERR0; /* This indicates success */
}
/* End of pcre_valid_utf8.c */

View File

@@ -153,7 +153,11 @@ enum {
ucp_Old_Turkic,
ucp_Samaritan,
ucp_Tai_Tham,
ucp_Tai_Viet
ucp_Tai_Viet,
/* New for Unicode 6.0.0: */
ucp_Batak,
ucp_Brahmi,
ucp_Mandaic
};
#endif