c:\harbour\source\hbpcre
pcrecomp.c |
Type | Function | Source | Line |
STATIC CONST CHAR * | find_error_text(int n)
/* Locate the text of error number n inside error_texts. The scan assumes
that the messages are stored one after another, each ended by a zero byte,
and steps over n of them. No bound is applied, so n must not exceed the
number of messages that are stored.

Argument:   n   index of the wanted message, counting from zero
Returns:        pointer to the first character of that message */

static const char *
find_error_text(int n)
{
const char *text = error_texts;
int remaining = n;

while (remaining > 0)
  {
  /* Step over one whole message and then over its terminating zero. */
  while (*text != 0) text++;
  text++;
  remaining--;
  }

return text;
}
/*************************************************
* Handle escapes *
*************************************************/
| pcrecomp.c | 454 |
STATIC INT | check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass)
/* Interpret the escape sequence that starts at the backslash which *ptrptr
points to.

Arguments:
  ptrptr         points to the pattern pointer, which is at the backslash
  errorcodeptr   set to an ERRnn value when the escape is invalid; it is
                   left untouched otherwise
  bracount       number of capturing brackets seen so far; used to resolve
                   relative \g references and to decide whether \nn is a
                   back reference
  options        compile options; PCRE_UTF8 and PCRE_EXTRA are consulted
  isclass        TRUE when the escape is inside a character class

Returns:         a non-zero entry from the escapes table is returned as is;
                   otherwise the result is a data character value (zero or
                   positive), -ESC_g, -ESC_k, or -(ESC_REF + n) for a
                   reference to group n. The value is not meaningful when
                   *errorcodeptr has been set. On return *ptrptr points to
                   the last character of the escape sequence.

Numbers are accumulated with an explicit range check made before each step.
Testing for a negative result after the event relies on signed overflow,
which is undefined behaviour in C. */

static int
check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
  int options, BOOL isclass)
{
BOOL utf8 = (options & PCRE_UTF8) != 0;
const uschar *ptr = *ptrptr + 1;
int c, i;

/* The largest value of an int, obtained without needing <limits.h>. This
assumes that unsigned int has exactly one more value bit than int. */

const int int_max = (int)(~0u >> 1);

GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
ptr--;                            /* Set pointer back to the last byte */

/* If backslash is at the end of the pattern, it's an error. */

if (c == 0) *errorcodeptr = ERR1;

/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. Note that when the lookup
yields zero, i is left at zero; the octal and hexadecimal digit loops below
depend on that. */

#ifndef EBCDIC  /* ASCII coding */
else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
else if ((i = escapes[c - '0']) != 0) c = i;

#else           /* EBCDIC coding */
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
else if ((i = escapes[c - 0x48]) != 0) c = i;
#endif

/* Escapes that need further processing, or are illegal. */

else
  {
  const uschar *oldptr;
  BOOL braced, negated, overflow;

  switch (c)
    {
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
    error. */

    case 'l':
    case 'L':
    case 'N':
    case 'u':
    case 'U':
    *errorcodeptr = ERR37;
    break;

    /* \g must be followed by one of a number of specific things:

    (1) A number, either plain or braced. If positive, it is an absolute
    backreference. If negative, it is a relative backreference. This is a Perl
    5.10 feature.

    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
    is part of Perl's movement towards a unified syntax for back references. As
    this is synonymous with \k{name}, we fudge it up by pretending it really
    was \k.

    (3) For Oniguruma compatibility we also support \g followed by a name or a
    number either in angle brackets or in single quotes. However, these are
    (possibly recursive) subroutine calls, _not_ backreferences. Just return
    the -ESC_g code (cf \k). */

    case 'g':
    if (ptr[1] == '<' || ptr[1] == '\'')
      {
      c = -ESC_g;
      break;
      }

    /* Handle the Perl-compatible cases */

    if (ptr[1] == '{')
      {
      const uschar *p;
      for (p = ptr+2; *p != 0 && *p != '}'; p++)
        if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
      if (*p != 0 && *p != '}')
        {
        c = -ESC_k;
        break;
        }
      braced = TRUE;
      ptr++;
      }
    else braced = FALSE;

    if (ptr[1] == '-')
      {
      negated = TRUE;
      ptr++;
      }
    else negated = FALSE;

    /* Read the number. Once it no longer fits in an int, the remaining
    digits are still consumed so that the pointer ends up after all of them,
    but nothing more is accumulated. */

    c = 0;
    overflow = FALSE;
    while ((digitab[ptr[1]] & ctype_digit) != 0)
      {
      int digit;                      /* Some compilers don't like ++ */
      digit = *(++ptr) - '0';         /* in initializers */
      if (overflow) continue;
      if (c > (int_max - digit) / 10) overflow = TRUE;
        else c = c * 10 + digit;
      }

    if (overflow)          /* Integer overflow */
      {
      *errorcodeptr = ERR61;
      break;
      }

    if (braced && *(++ptr) != '}')
      {
      *errorcodeptr = ERR57;
      break;
      }

    if (c == 0)
      {
      *errorcodeptr = ERR58;
      break;
      }

    if (negated)
      {
      if (c > bracount)
        {
        *errorcodeptr = ERR15;
        break;
        }
      c = bracount - (c - 1);
      }

    c = -(ESC_REF + c);
    break;

    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. By experiment,
    the way Perl works seems to be as follows:

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 10, or if there are that many previous extracting
    left brackets, then it is a back reference. Otherwise, up to three octal
    digits are read to form an escaped byte. Thus \123 is likely to be octal
    123 (cf \0123, which is octal 012 followed by the literal 3). Inside a
    character class, \ followed by a digit is always an octal number. In
    either case an octal value greater than 255 is an error unless UTF-8 mode
    is set. */

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9':

    if (!isclass)
      {
      oldptr = ptr;
      c -= '0';
      overflow = FALSE;
      while ((digitab[ptr[1]] & ctype_digit) != 0)
        {
        int digit;                    /* Some compilers don't like ++ */
        digit = *(++ptr) - '0';       /* in initializers */
        if (overflow) continue;
        if (c > (int_max - digit) / 10) overflow = TRUE;
          else c = c * 10 + digit;
        }
      if (overflow)        /* Integer overflow */
        {
        *errorcodeptr = ERR61;
        break;
        }
      if (c < 10 || c <= bracount)
        {
        c = -(ESC_REF + c);
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
    generates a binary zero byte and treats the digit as a following literal.
    Thus we have to pull back the pointer by one. */

    if ((c = *ptr) >= '8')
      {
      ptr--;
      c = 0;
      break;
      }

    /* Fall through */

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. The original code used just to take the least
    significant 8 bits of octal numbers (I think this is what early Perls used
    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
    than 3 octal digits. */

    case '0':
    c -= '0';
    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
        c = c * 8 + *(++ptr) - '0';
    if (!utf8 && c > 255) *errorcodeptr = ERR51;
    break;

    /* \x is complicated. \x{ddd} is a character number which can be greater
    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
    treated as a data character. */

    case 'x':
    if (ptr[1] == '{')
      {
      const uschar *pt = ptr + 2;
      int count = 0;
      unsigned int xc = 0;

      c = 0;
      while ((digitab[*pt] & ctype_xdigit) != 0)
        {
        register int cc = *pt++;
        if (count == 0 && cc == '0') continue;     /* Leading zeroes */
        count++;

        /* More than 8 significant digits is always an error, so there is
        no need to accumulate them. The value is built in an unsigned
        variable so that the shift is well defined. */

        if (count > 8) continue;

#ifndef EBCDIC  /* ASCII coding */
        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
        xc = (xc << 4) + (unsigned int)(cc - ((cc < 'A')? '0' : ('A' - 10)));
#else           /* EBCDIC coding */
        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
        xc = (xc << 4) + (unsigned int)(cc - ((cc >= '0')? '0' : ('A' - 10)));
#endif
        }

      if (*pt == '}')
        {
        if (xc > (unsigned int)int_max || count > (utf8? 8 : 2))
          *errorcodeptr = ERR34;
        else
          c = (int)xc;
        ptr = pt;
        break;
        }

      /* If the sequence of hex digits does not end with '}', then we don't
      recognize this construct; fall through to the normal \x handling. */
      }

    /* Read just a single-byte hex-defined char */

    c = 0;
    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
      {
      int cc;                               /* Some compilers don't like ++ */
      cc = *(++ptr);                        /* in initializers */
#ifndef EBCDIC  /* ASCII coding */
      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
#else           /* EBCDIC coding */
      if (cc <= 'z') cc += 64;              /* Convert to upper case */
      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
#endif
      }
    break;

    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
    This coding is ASCII-specific, but then the whole concept of \cx is
    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */

    case 'c':
    c = *(++ptr);
    if (c == 0)
      {
      *errorcodeptr = ERR2;
      break;
      }

#ifndef EBCDIC  /* ASCII coding */
    if (c >= 'a' && c <= 'z') c -= 32;
    c ^= 0x40;
#else           /* EBCDIC coding */
    if (c >= 'a' && c <= 'z') c += 64;
    c ^= 0xC0;
#endif
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphanumeric following \ is an error if PCRE_EXTRA was set;
    otherwise, for Perl compatibility, it is a literal. This code looks a bit
    odd, but there used to be some cases other than the default, and there may
    be again in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorcodeptr = ERR3;
      break;
      }
    break;
    }
  }

*ptrptr = ptr;
return c;
}
#ifdef SUPPORT_UCP
/*************************************************
* Handle \P and \p *
*************************************************/
| pcrecomp.c | 487 |
STATIC INT | get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
/* Read the property name that follows \p or \P and look it up in the
_pcre_utt table.

Arguments:
  ptrptr         points to the pattern pointer, which is at the p or P
  negptr         set TRUE when the name is written as {^name}, FALSE
                   otherwise; not written if the pattern ends immediately
  dptr           receives the value field of the matching table entry
  errorcodeptr   receives ERR46 for a malformed name, or ERR47 for a name
                   that is not in the table

Returns:         the type field of the matching table entry, or -1 after an
                   error. In every case *ptrptr is advanced to the last
                   character that was read. */

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, low, high;
int failure = ERR46;        /* Until the name has been read successfully */

ch = *(++ptr);
if (ch == 0) goto FAILED;

*negptr = FALSE;

/* A single character that is not '{' is the whole name. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise the name is in {}, optionally preceded by ^ for negation. It
has to fit in the buffer with room left for the terminating zero. */

else
  {
  const int limit = (int)sizeof(name) - 1;

  if (ptr[1] == '^')
    {
    ptr++;
    *negptr = TRUE;
    }

  len = 0;
  for (;;)
    {
    if (len >= limit) goto FAILED;       /* No closing brace in range */
    ch = *(++ptr);
    if (ch == 0) goto FAILED;
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the table of property names. From here on, a failure
means that the name is unknown. */

failure = ERR47;
low = 0;
high = _pcre_utt_size;

while (low < high)
  {
  int mid = (low + high) >> 1;
  int order = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (order == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }

  if (order > 0) low = mid + 1; else high = mid;
  }

FAILED:
*errorcodeptr = failure;
*ptrptr = ptr;
return -1;
}
#endif
/*************************************************
* Check for counted repeat *
*************************************************/
| pcrecomp.c | 792 |
STATIC BOOL | is_counted_repeat(const uschar *p)
/* Decide whether the text after an opening brace has the form of a counted
repeat, that is, one of {n} {n,} or {n,m}, where n and m are strings of one
or more digits.

Argument:   p   points to the first character after the '{'
Returns:        TRUE if the text is a well-formed quantifier, else FALSE */

static BOOL
is_counted_repeat(const uschar *p)
{
const uschar *q = p;

/* The minimum is mandatory and consists of at least one digit. */

if ((digitab[*q] & ctype_digit) == 0) return FALSE;
do q++; while ((digitab[*q] & ctype_digit) != 0);

/* The form {n} ends here. */

if (*q == '}') return TRUE;

/* Otherwise only a comma can follow the minimum. */

if (*q != ',') return FALSE;
q++;

/* The form {n,} ends here. */

if (*q == '}') return TRUE;

/* What remains must be a maximum followed directly by the closing brace. */

if ((digitab[*q] & ctype_digit) == 0) return FALSE;
do q++; while ((digitab[*q] & ctype_digit) != 0);

return (*q == '}');
}
/*************************************************
* Read repeat counts *
*************************************************/
| pcrecomp.c | 881 |
STATIC CONST USCHAR * | read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
/* Read the numbers of a {n} {n,} or {n,m} quantifier. The text is expected
to have been checked already for the right shape (cf is_counted_repeat).

Arguments:
  p              points to the first character after the '{'
  minp           receives the minimum
  maxp           receives the maximum; -1 means "no upper limit", and for
                   the {n} form it is the same as the minimum
  errorcodeptr   receives ERR5 if a number is greater than 65535, or ERR4
                   if the maximum is less than the minimum

Returns:         pointer to the terminating '}' on success; after an error
                   it is the pointer to the character that follows the
                   digits of the offending number, and *minp and *maxp are
                   not written

Each number is checked against the limit while it is being read. Checking
only at the end would allow a long digit string to overflow the int, which
is undefined behaviour and can wrap round to a value that looks valid. */

static const uschar *
read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
{
int min = 0;
int max = -1;

/* Read the minimum value. As soon as it exceeds the limit, skip the rest
of its digits and give up. */

while ((digitab[*p] & ctype_digit) != 0)
  {
  min = min * 10 + *p++ - '0';
  if (min > 65535)
    {
    while ((digitab[*p] & ctype_digit) != 0) p++;
    *errorcodeptr = ERR5;
    return p;
    }
  }

/* Read the maximum value if there is one, and again check its size while
reading. Also, max must not be less than min. */

if (*p == '}') max = min; else
  {
  if (*(++p) != '}')
    {
    max = 0;
    while ((digitab[*p] & ctype_digit) != 0)
      {
      max = max * 10 + *p++ - '0';
      if (max > 65535)
        {
        while ((digitab[*p] & ctype_digit) != 0) p++;
        *errorcodeptr = ERR5;
        return p;
        }
      }
    if (max < min)
      {
      *errorcodeptr = ERR4;
      return p;
      }
    }
  }

/* Fill in the required variables, and pass back the pointer to the terminating
'}'. */

*minp = min;
*maxp = max;
return p;
}
/*************************************************
* Find forward referenced subpattern *
*************************************************/
| pcrecomp.c | 918 |
STATIC INT | find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode)
/* Scan forwards through a pattern that has not been compiled yet, looking
for a capturing group, either by number or by name. Escapes, \Q...\E
sequences, character classes and (in extended mode) comments are skipped so
that parentheses inside them are not counted.

Arguments:
  ptr     current position in the pattern
  cd      compile data; bracount supplies the number of groups already
            seen, and external_options is tested for PCRE_JAVASCRIPT_COMPAT
  name    name to look for, or NULL when searching by number
  lorn    the length of the name, or the number being sought if name is NULL
  xmode   TRUE if comments starting with # are to be skipped

Returns:  the number of the group that was found, or -1 if the end of the
            pattern (or a malformed item) is reached first

The scan never moves past the terminating zero of the pattern: every step
that consumes a character first makes sure that the character exists. */

static int
find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
  BOOL xmode)
{
const uschar *thisname;
int count = cd->bracount;

for (; *ptr != 0; ptr++)
  {
  int term;

  /* Skip over backslashed characters and also entire \Q...\E. Within a
  quoted string a backslash that is not followed by E is just data, so the
  character after it is inspected again instead of being consumed. This
  finds the \E in \Q\\E and stops cleanly when the pattern ends with a
  backslash. */

  if (*ptr == '\\')
    {
    if (*(++ptr) == 0) return -1;
    if (*ptr == 'Q') for (;;)
      {
      while (*(++ptr) != 0 && *ptr != '\\');
      if (*ptr == 0) return -1;
      if (ptr[1] == 'E') { ptr++; break; }
      }
    continue;
    }

  /* Skip over character classes; this logic must be similar to the way they
  are handled for real. If the first character is '^', skip it. Also, if the
  first few characters (either before or after ^) are \Q\E or \E we skip them
  too. This makes for compatibility with Perl. The loop looks ahead so that,
  when it finishes, ptr[1] is the first real character of the class. */

  if (*ptr == '[')
    {
    BOOL negate_class = FALSE;
    for (;;)
      {
      if (ptr[1] == '\\')
        {
        if (ptr[2] == 'E') ptr += 2;
        else if (strncmp((const char *)ptr+2, "Q\\E", 3) == 0) ptr += 4;
        else break;
        }
      else if (!negate_class && ptr[1] == '^')
        {
        negate_class = TRUE;
        ptr++;
        }
      else break;
      }

    /* If the next character is ']', it is a data character that must be
    skipped, except in JavaScript compatibility mode. */

    if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
      ptr++;

    while (*(++ptr) != ']')
      {
      if (*ptr == 0) return -1;
      if (*ptr == '\\')
        {
        if (*(++ptr) == 0) return -1;
        if (*ptr == 'Q') for (;;)
          {
          while (*(++ptr) != 0 && *ptr != '\\');
          if (*ptr == 0) return -1;
          if (ptr[1] == 'E') { ptr++; break; }
          }
        continue;
        }
      }
    continue;
    }

  /* Skip comments in /x mode */

  if (xmode && *ptr == '#')
    {
    while (*(++ptr) != 0 && *ptr != '\n');
    if (*ptr == 0) return -1;
    continue;
    }

  /* An opening parens must now be a real metacharacter */

  if (*ptr != '(') continue;
  if (ptr[1] != '?' && ptr[1] != '*')
    {
    count++;
    if (name == NULL && count == lorn) return count;
    continue;
    }

  ptr += 2;
  if (*ptr == 'P') ptr++;                      /* Allow optional P */

  /* The pattern may end straight after "(?" or "(*"; stop here so that the
  loop increment does not step over the terminating zero. */

  if (*ptr == 0) return -1;

  /* We have to disambiguate (?<! and (?<= from (?<name> */

  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
       *ptr != '\'')
    continue;

  count++;

  if (name == NULL && count == lorn) return count;
  term = *ptr++;
  if (term == '<') term = '>';
  thisname = ptr;

  /* Find the end of the name; an unterminated name ends the search. */

  while (*ptr != term)
    {
    if (*ptr == 0) return -1;
    ptr++;
    }

  if (name != NULL && lorn == ptr - thisname &&
      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
    return count;
  }

return -1;
}
/*************************************************
* Find first significant op code *
*************************************************/
| pcrecomp.c | 987 |
STATIC CONST USCHAR* | first_significant_code(const uschar *code, int *options, int optbit, BOOL skipassert)
/* Move forward over leading items of compiled code that do not themselves
have to match anything: option settings, callouts, condition references
and, when asked, certain assertions.

Arguments:
  code         pointer to the start of the items to examine
  options      pointer to the current options; updated from an OP_OPT item
                 when the bits selected by optbit differ. It is not
                 dereferenced unless optbit is greater than zero.
  optbit       the option bit(s) of interest, or zero for none
  skipassert   TRUE if negative assertions, lookbehinds and word boundary
                 tests are to be skipped as well

Returns:       pointer to the first item that was not skipped */

static const uschar*
first_significant_code(const uschar *code, int *options, int optbit,
  BOOL skipassert)
{
for (;;)
  {
  const int op = (int)*code;

  /* An option setting: record it if the bit of interest changes. */

  if (op == OP_OPT)
    {
    if (optbit > 0)
      {
      const int newbits = (int)code[1];
      if ((newbits & optbit) != (*options & optbit)) *options = newbits;
      }
    code += 2;
    continue;
    }

  /* Assertion groups: step along the chain of alternatives and then over
  the item at the end of it. */

  if (op == OP_ASSERT_NOT || op == OP_ASSERTBACK || op == OP_ASSERTBACK_NOT)
    {
    if (!skipassert) break;
    do code += GET(code, 1); while (*code == OP_ALT);
    code += _pcre_OP_lengths[*code];
    continue;
    }

  /* Word boundary tests are skipped on the same condition. */

  if (op == OP_WORD_BOUNDARY || op == OP_NOT_WORD_BOUNDARY)
    {
    if (!skipassert) break;
    code += _pcre_OP_lengths[op];
    continue;
    }

  /* These items are always skipped. */

  if (op == OP_CALLOUT || op == OP_CREF || op == OP_RREF || op == OP_DEF)
    {
    code += _pcre_OP_lengths[op];
    continue;
    }

  /* Anything else is significant. */

  break;
  }

return code;
}
/*************************************************
* Find the fixed length of a pattern *
*************************************************/
| pcrecomp.c | 1123 |
STATIC INT | find_fixedlength(uschar *code, int options)
/* Work out the number of characters that a compiled group matches, provided
that every alternative matches the same, fixed, number. Nested groups are
handled by recursion.

Arguments:
  code     points to the start of the group (the opening bracket item)
  options  the compiling options; only PCRE_UTF8 is consulted

Returns:   the fixed length, or
             -1 if the length is not fixed (including alternatives of
                differing lengths), or
             -2 if OP_ANYBYTE is encountered

In UTF-8 mode the extra bytes of a multibyte character are skipped by
testing for continuation bytes, i.e. (byte & 0xc0) == 0x80, in the same way
for OP_EXACT as for the literal character items. */

static int
find_fixedlength(uschar *code, int options)
{
int length = -1;

register int branchlength = 0;
register uschar *cc = code + 1 + LINK_SIZE;

/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */

for (;;)
  {
  int d;
  register int op = *cc;
  switch (op)
    {
    /* A nested group: its own fixed length is added. For OP_CBRA the
    pointer passed down is two bytes further on. */

    case OP_CBRA:
    case OP_BRA:
    case OP_ONCE:
    case OP_COND:
    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
    if (d < 0) return d;
    branchlength += d;
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Reached end of a branch; if it's a ket it is the end of a nested
    call. If it's ALT it is an alternation in a nested call. If it is
    END it's the end of the outer call. All can be handled by the same code. */

    case OP_ALT:
    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_END:
    if (length < 0) length = branchlength;
      else if (length != branchlength) return -1;
    if (*cc != OP_ALT) return length;
    cc += 1 + LINK_SIZE;
    branchlength = 0;
    break;

    /* Skip over assertive subpatterns */

    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    /* Fall through */

    /* Skip over things that don't match chars */

    case OP_REVERSE:
    case OP_CREF:
    case OP_RREF:
    case OP_DEF:
    case OP_OPT:
    case OP_CALLOUT:
    case OP_SOD:
    case OP_SOM:
    case OP_EOD:
    case OP_EODN:
    case OP_CIRC:
    case OP_DOLL:
    case OP_NOT_WORD_BOUNDARY:
    case OP_WORD_BOUNDARY:
    cc += _pcre_OP_lengths[*cc];
    break;

    /* Handle literal characters */

    case OP_CHAR:
    case OP_CHARNC:
    case OP_NOT:
    branchlength++;
    cc += 2;
#ifdef SUPPORT_UTF8
    if ((options & PCRE_UTF8) != 0)
      {
      while ((*cc & 0xc0) == 0x80) cc++;
      }
#endif
    break;

    /* Handle exact repetitions. The count is already in characters, but we
    need to skip over a multibyte character in UTF8 mode. Only continuation
    bytes are skipped, so that a following item is never swallowed. */

    case OP_EXACT:
    branchlength += GET2(cc,1);
    cc += 4;
#ifdef SUPPORT_UTF8
    if ((options & PCRE_UTF8) != 0)
      {
      while ((*cc & 0xc0) == 0x80) cc++;
      }
#endif
    break;

    /* An exact repeat of a character type; \p and \P have two extra bytes
    of parameters. */

    case OP_TYPEEXACT:
    branchlength += GET2(cc,1);
    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
    cc += 4;
    break;

    /* Handle single-char matchers */

    case OP_PROP:
    case OP_NOTPROP:
    cc += 2;
    /* Fall through */

    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    branchlength++;
    cc++;
    break;

    /* The single-byte matcher isn't allowed */

    case OP_ANYBYTE:
    return -2;

    /* Check a class for variable quantification. A simple class occupies
    33 bytes; for an XCLASS the total length is held in the item itself. A
    repeat that is not listed below leaves cc pointing at the repeat opcode,
    which then falls into the "anything else" case on the next pass. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    cc += GET(cc, 1) - 33;
    /* Fall through */
#endif

    case OP_CLASS:
    case OP_NCLASS:
    cc += 33;

    switch (*cc)
      {
      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      return -1;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(cc,1) != GET2(cc,3)) return -1;
      branchlength += GET2(cc,1);
      cc += 5;
      break;

      default:
      branchlength++;
      }
    break;

    /* Anything else is variable length */

    default:
    return -1;
    }
  }
/* Control never gets here */
}
/*************************************************
* Scan compiled regex for numbered bracket *
*************************************************/
| pcrecomp.c | 1183 |
STATIC CONST USCHAR * | find_bracket(const uschar *code, BOOL utf8, int number)
/* Walk through compiled code looking for the capturing bracket that has a
given number.

Arguments:
  code     points to the start of the compiled code to scan
  utf8     TRUE in UTF-8 mode
  number   the capturing bracket number that is wanted

Returns:   pointer to the OP_CBRA item, or NULL if OP_END is reached first */

static const uschar *
find_bracket(const uschar *code, BOOL utf8, int number)
{
#ifndef SUPPORT_UTF8
(void)(utf8);                         /* pacify warnings */
#endif

while (*code != OP_END)
  {
  const int op = *code;
  int typepos;      /* Where a repeated item keeps its type code; 0 if n/a */

  /* XCLASS is used for classes that cannot be represented just by a bit
  map. Its table length is zero; the real length is held in the item. */

  if (op == OP_XCLASS)
    {
    code += GET(code, 1);
    continue;
    }

  /* A capturing bracket: is it the one that is wanted? */

  if (op == OP_CBRA)
    {
    if (GET2(code, 1+LINK_SIZE) == number) return (uschar *)code;
    code += _pcre_OP_lengths[op];
    continue;
    }

  /* For repeated character types, \p and \P carry two extra bytes of
  parameters that the length table does not include. The type code comes
  straight after the opcode, or after a two-byte count. */

  switch (op)
    {
    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSPLUS:
    case OP_TYPEPOSQUERY:
    typepos = 1;
    break;

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEEXACT:
    case OP_TYPEPOSUPTO:
    typepos = 3;
    break;

    default:
    typepos = 0;
    break;
    }

  if (typepos != 0 &&
      (code[typepos] == OP_PROP || code[typepos] == OP_NOTPROP))
    code += 2;

  /* Add in the fixed length from the table */

  code += _pcre_OP_lengths[op];

  /* In UTF-8 mode, opcodes that are followed by a character may be followed
  by a multi-byte character. The table length allows for one byte only; the
  rest are skipped using _pcre_utf8_table4, indexed by the low six bits of
  the first byte. */

#ifdef SUPPORT_UTF8
  if (utf8) switch (op)
    {
    case OP_CHAR:
    case OP_CHARNC:
    case OP_EXACT:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_POSUPTO:
    case OP_STAR:
    case OP_MINSTAR:
    case OP_POSSTAR:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_POSQUERY:
    if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
    break;
    }
#endif
  }

return NULL;
}
/*************************************************
* Scan compiled regex for recursion reference *
*************************************************/
| pcrecomp.c | 1373 |
STATIC CONST USCHAR * | find_recurse(const uschar *code, BOOL utf8)
/* Walk through compiled code looking for the next OP_RECURSE item.

Arguments:
  code   points to the start of the compiled code to scan
  utf8   TRUE in UTF-8 mode

Returns: pointer to the OP_RECURSE opcode, or NULL if OP_END comes first */

static const uschar *
find_recurse(const uschar *code, BOOL utf8)
{
#ifndef SUPPORT_UTF8
(void)(utf8);                         /* pacify warnings */
#endif

for (;; )
  {
  const int op = *code;
  BOOL has_prop = FALSE;     /* Set when a \p or \P type follows a repeat */

  if (op == OP_END) break;
  if (op == OP_RECURSE) return code;

  /* XCLASS is used for classes that cannot be represented just by a bit
  map. Its table length is zero; the real length is held in the item. */

  if (op == OP_XCLASS)
    {
    code += GET(code, 1);
    continue;
    }

  /* For repeated character types we have to test for \p and \P, which have
  an extra two bytes of parameters. */

  switch (op)
    {
    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSPLUS:
    case OP_TYPEPOSQUERY:
    has_prop = (code[1] == OP_PROP || code[1] == OP_NOTPROP);
    break;

    case OP_TYPEPOSUPTO:
    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEEXACT:
    has_prop = (code[3] == OP_PROP || code[3] == OP_NOTPROP);
    break;
    }

  if (has_prop) code += 2;

  /* Add in the fixed length from the table */

  code += _pcre_OP_lengths[op];

  /* In UTF-8 mode, opcodes that are followed by a character may be followed
  by a multi-byte character. The length in the table is a minimum, so we have
  to arrange to skip the extra bytes. */

#ifdef SUPPORT_UTF8
  if (utf8) switch (op)
    {
    case OP_CHAR:
    case OP_CHARNC:
    case OP_EXACT:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_POSUPTO:
    case OP_STAR:
    case OP_MINSTAR:
    case OP_POSSTAR:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_POSQUERY:
    if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
    break;
    }
#endif
  }

return NULL;
}
/*************************************************
* Scan compiled branch for non-emptiness *
*************************************************/
| pcrecomp.c | 1477 |
STATIC BOOL | could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
/* Scan one branch of a compiled group to find out whether it could match
an empty string. The scan stops at the end of the branch or at endcode,
whichever comes first; the latter happens when the group is not yet closed.

Arguments:
  code      points to the start of the branch (the bracket or OP_ALT item)
  endcode   points to where to stop
  utf8      TRUE if in UTF-8 mode

Returns:    TRUE if what is scanned could match an empty string

Each pass of the loop ends by stepping over the current item, using the
table length of the opcode held in c. A case that moves "code" itself must
therefore leave c and code consistent with each other. */

static BOOL
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
{
register int c;
for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
     code < endcode;
     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
  {
  const uschar *ccode;

  c = *code;

  /* Skip over forward assertions; the other assertions are skipped by
  first_significant_code() with a TRUE final argument. */

  if (c == OP_ASSERT)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* Groups with zero repeats can of course be empty; skip them. */

  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
    {
    code += _pcre_OP_lengths[c];
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* For other groups, scan the branches. */

  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
    {
    BOOL empty_branch;
    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */

    /* Scan a closed bracket */

    empty_branch = FALSE;
    do
      {
      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
        empty_branch = TRUE;
      code += GET(code, 1);
      }
    while (*code == OP_ALT);
    if (!empty_branch) return FALSE;   /* All branches are non-empty */
    c = *code;
    continue;
    }

  /* Handle the other opcodes */

  switch (c)
    {
    /* Check for quantifiers after a class. XCLASS is used for classes that
    cannot be represented just by a bit map. This includes negated single
    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
    actual length is stored in the compiled code, so we must update "code"
    here. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    ccode = code += GET(code, 1);
    goto CHECK_CLASS_REPEAT;
#endif

    case OP_CLASS:
    case OP_NCLASS:
    ccode = code + 33;

#ifdef SUPPORT_UTF8
    CHECK_CLASS_REPEAT:
#endif

    switch (*ccode)
      {
      case OP_CRSTAR:            /* These could be empty; continue */
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      break;

      default:                   /* Non-repeat => class must match */
      case OP_CRPLUS:            /* These repeats aren't empty */
      case OP_CRMINPLUS:
      return FALSE;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
      break;
      }
    break;

    /* Opcodes that must match a character */

    case OP_PROP:
    case OP_NOTPROP:
    case OP_EXTUNI:
    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    case OP_ANYBYTE:
    case OP_CHAR:
    case OP_CHARNC:
    case OP_NOT:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_EXACT:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTPOSPLUS:
    case OP_NOTEXACT:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEEXACT:
    return FALSE;

    /* These are going to continue, as they may be empty, but we have to
    fudge the length for the \p and \P cases. */

    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPOSSTAR:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSQUERY:
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    break;

    /* Same for these */

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEPOSUPTO:
    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
    break;

    /* End of branch */

    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_ALT:
    return TRUE;

    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
    MINUPTO, and POSUPTO may be followed by a multibyte character. The table
    length covers the first byte only, so step over any continuation bytes.
    For the first group the character starts at code[1], which puts its
    continuation bytes at code[2] onwards. */

#ifdef SUPPORT_UTF8
    case OP_STAR:
    case OP_MINSTAR:
    case OP_POSSTAR:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_POSQUERY:
    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
    break;

    /* For the UPTO opcodes the character follows a two-byte repeat count, as
    it does for OP_EXACT, so it starts at code[3] and its continuation bytes
    begin at code[4]. Testing code[2] here would inspect the low byte of the
    count instead of the character. */

    case OP_UPTO:
    case OP_MINUPTO:
    case OP_POSUPTO:
    if (utf8) while ((code[4] & 0xc0) == 0x80) code++;
    break;
#endif
    }
  }

return TRUE;
}
/*************************************************
* Scan compiled regex for non-emptiness *
*************************************************/
| pcrecomp.c | 1579 |
STATIC BOOL | could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8)
/* Check the current branch of every enclosing group, working outwards
along the chain for as long as the entries lie at or after "code". The
answer is TRUE only if every one of those branches could match an empty
string.

Arguments:
  code      lowest code address of interest
  endcode   points to the end of the code compiled so far
  bcptr     innermost entry of the chain of current branches
  utf8      TRUE if in UTF-8 mode

Returns:    TRUE if all the branches examined could be empty */

static BOOL
could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
  BOOL utf8)
{
branch_chain *bc;

for (bc = bcptr; bc != NULL && bc->current >= code; bc = bc->outer)
  {
  if (!could_be_empty_branch(bc->current, endcode, utf8)) return FALSE;
  }

return TRUE;
}
/*************************************************
* Check for POSIX class syntax *
*************************************************/
| pcrecomp.c | 1778 |
STATIC BOOL | check_posix_syntax(const uschar *ptr, const uschar **endptr)
/* Check whether the text after an opening square bracket has the shape of
a POSIX item. The character after the bracket is taken as the terminator,
and the item is accepted if that same character, immediately followed by
']', turns up before any other unescaped ']'.

Arguments:
  ptr      points to the opening square bracket
  endptr   where to return the position of the terminator; written only
             when the result is TRUE

Returns:   TRUE if a terminated item was found, FALSE otherwise */

static BOOL
check_posix_syntax(const uschar *ptr, const uschar **endptr)
{
int terminator;
const uschar *scan;

terminator = ptr[1];

scan = ptr + 2;
while (*scan != 0)
  {
  /* An escaped closing bracket is just data; step over both characters. */

  if (scan[0] == '\\' && scan[1] == ']')
    {
    scan += 2;
    continue;
    }

  /* A bare closing bracket ends the search without a match. */

  if (*scan == ']') break;

  if (*scan == terminator && scan[1] == ']')
    {
    *endptr = scan;
    return TRUE;
    }

  scan++;
  }

return FALSE;
}
/*************************************************
* Check POSIX class name *
*************************************************/
| pcrecomp.c | 1823 |
STATIC INT | check_posix_name(const uschar *ptr, int len)
/* Look up a POSIX class name. The names are held one after another in
posix_names, each followed by a single separator byte, and their lengths
are in posix_name_lengths, whose list is ended by a zero entry.

Arguments:
  ptr   points to the first character of the name
  len   the length of the name

Returns: the index of the name in the list, or -1 if it is not there */

static int
check_posix_name(const uschar *ptr, int len)
{
const char *entry = posix_names;
int index;

for (index = 0; posix_name_lengths[index] != 0; index++)
  {
  if (posix_name_lengths[index] == len &&
      strncmp((const char *)ptr, entry, len) == 0)
    return index;

  /* Move on by the length of this name plus its separator. */

  entry += posix_name_lengths[index] + 1;
  }

return -1;
}
/*************************************************
* Adjust OP_RECURSE items in repeated group *
*************************************************/
| pcrecomp.c | 1860 |
STATIC VOID | adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm)
/* Add an adjustment to the offsets of the OP_RECURSE items found from the
start of a group onwards.

Arguments:
  group      points to the start of the group
  adjust     the amount to add to each affected offset
  utf8       TRUE in UTF-8 mode
  cd         compile data; start_code and hwm are used
  save_hwm   start of the region, ending at cd->hwm, that holds offsets of
               forward references

Returns:     nothing */

static void
adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
  uschar *save_hwm)
{
uschar *rec;

for (rec = (uschar *)find_recurse(group, utf8);
     rec != NULL;
     rec = (uschar *)find_recurse(rec + 1 + LINK_SIZE, utf8))
  {
  uschar *slot = save_hwm;
  BOOL is_forward = FALSE;

  /* See if this recursion is on the forward reference list. If so, it is
  the list entry that is adjusted. */

  while (slot < cd->hwm)
    {
    const int fwd = GET(slot, 0);
    if (cd->start_code + fwd == rec + 1)
      {
      PUT(slot, 0, fwd + adjust);
      is_forward = TRUE;
      break;
      }
    slot += LINK_SIZE;
    }

  /* Otherwise, adjust the recursion offset itself, but only if it refers to
  something at or after the start of this group. */

  if (!is_forward)
    {
    const int target = GET(rec, 1);
    if (cd->start_code + target >= group) PUT(rec, 1, target + adjust);
    }
  }
}
/*************************************************
* Insert an automatic callout point *
*************************************************/
| pcrecomp.c | 1907 |
STATIC USCHAR * | auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
/* Emit a callout item for the current pattern position. The item consists
of the OP_CALLOUT opcode, the callout number 255, the offset of ptr within
the pattern, and a length field that is set to zero for now (it is filled
in later, cf complete_callout).

Arguments:
  code   where to put the item
  ptr    current position in the pattern
  cd     compile data; start_pattern is used

Returns: pointer to the byte after the new item */

static uschar *
auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
{
uschar *item = code;

item[0] = OP_CALLOUT;
item[1] = 255;
PUT(item, 2, ptr - cd->start_pattern);     /* Pattern offset */
PUT(item, 2 + LINK_SIZE, 0);               /* Default length */

return item + 2 + 2*LINK_SIZE;
}
/*************************************************
* Complete a callout item *
*************************************************/
| pcrecomp.c | 1961 |
STATIC VOID | complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
/* Fill in the length field of a callout item once the end of the pattern
item that follows it is known. The length is the distance from the pattern
offset stored in the callout to the current pattern position.

Arguments:
  previous_callout   points to the callout item to complete
  ptr                current position in the pattern
  cd                 compile data; start_pattern is used

Returns:             nothing */

static void
complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
{
int start_offset = GET(previous_callout, 2);
int item_length = ptr - cd->start_pattern - start_offset;

PUT(previous_callout, 2 + LINK_SIZE, item_length);
}
#ifdef SUPPORT_UCP
/*************************************************
* Get othercase range *
*************************************************/
| pcrecomp.c | 1989 |
STATIC BOOL | get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr)
/* Find the next run of characters, within a given range, whose other-case
characters form a run of consecutive values.

Arguments:
  cptr    points to the first character to examine; on a TRUE return it is
            updated to the character at which the next search should start
  d       the last character of the range
  ocptr   receives the other case of the first character of the run
  odptr   receives the other case of the last character of the run

Returns:  TRUE if a run was found, or FALSE if no character from *cptr up
            to d has another case (in which case nothing is written) */

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Look for the first character that has another case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = first_other;

/* Extend the run for as long as the other-case values go up in step. */

expected = first_other + 1;
ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;
return TRUE;
}
#endif /* SUPPORT_UCP */
/*************************************************
* Check if auto-possessifying is possible *
*************************************************/
| pcrecomp.c | 2017 |
STATIC BOOL | check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, const uschar *ptr, int options, compile_data *cd)
static BOOL
check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
const uschar *ptr, int options, compile_data *cd)
{
int next;
/* Skip whitespace and comments in extended mode */
if ((options & PCRE_EXTENDED) != 0)
{
for (;;)
{
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == '#')
{
while (*(++ptr) != 0)
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
}
else break;
}
}
/* If the next item is one that we can handle, get its value. A non-negative
value is a character, a negative value is an escape value. */
if (*ptr == '\\')
{
int temperrorcode = 0;
next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
if (temperrorcode != 0) return FALSE;
ptr++; /* Point after the escape sequence */
}
else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
{
#ifdef SUPPORT_UTF8
if (utf8) { GETCHARINC(next, ptr); } else
#endif
next = *ptr++;
}
else return FALSE;
/* Skip whitespace and comments in extended mode */
if ((options & PCRE_EXTENDED) != 0)
{
for (;;)
{
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == '#')
{
while (*(++ptr) != 0)
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
}
else break;
}
}
/* If the next thing is itself optional, we have to give up. */
if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
return FALSE;
/* Now compare the next item with the previous opcode. If the previous is a
positive single character match, "item" either contains the character or, if
"item" is greater than 127 in utf8 mode, the character's bytes are in
utf8_char. */
/* Handle cases when the next item is a character. */
if (next >= 0) switch(op_code)
{
case OP_CHAR:
#ifdef SUPPORT_UTF8
if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
#else
/* pacify warnings */
(void)(utf8_char);
#endif
return item != next;
/* For CHARNC (caseless character) we must check the other case. If we have
Unicode property support, we can use it to test the other case of
high-valued characters. */
case OP_CHARNC:
#ifdef SUPPORT_UTF8
if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
#endif
if (item == next) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
{
unsigned int othercase;
if (next < 128) othercase = cd->fcc[next]; else
#ifdef SUPPORT_UCP
othercase = _pcre_ucp_othercase((unsigned int)next);
#else
othercase = NOTACHAR;
#endif
return (unsigned int)item != othercase;
}
else
#endif /* SUPPORT_UTF8 */
return (item != cd->fcc[next]); /* Non-UTF-8 mode */
/* For OP_NOT, "item" must be a single-byte character. */
case OP_NOT:
if (item == next) return TRUE;
if ((options & PCRE_CASELESS) == 0) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
{
unsigned int othercase;
if (next < 128) othercase = cd->fcc[next]; else
#ifdef SUPPORT_UCP
othercase = _pcre_ucp_othercase(next);
#else
othercase = NOTACHAR;
#endif
return (unsigned int)item == othercase;
}
else
#endif /* SUPPORT_UTF8 */
return (item == cd->fcc[next]); /* Non-UTF-8 mode */
case OP_DIGIT:
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
case OP_NOT_DIGIT:
return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
case OP_WHITESPACE:
return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
case OP_NOT_WHITESPACE:
return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
case OP_WORDCHAR:
return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
case OP_NOT_WORDCHAR:
return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
case OP_HSPACE:
case OP_NOT_HSPACE:
switch(next)
{
case 0x09:
case 0x20:
case 0xa0:
case 0x1680:
case 0x180e:
case 0x2000:
case 0x2001:
case 0x2002:
case 0x2003:
case 0x2004:
case 0x2005:
case 0x2006:
case 0x2007:
case 0x2008:
case 0x2009:
case 0x200A:
case 0x202f:
case 0x205f:
case 0x3000:
return op_code != OP_HSPACE;
default:
return op_code == OP_HSPACE;
}
case OP_VSPACE:
case OP_NOT_VSPACE:
switch(next)
{
case 0x0a:
case 0x0b:
case 0x0c:
case 0x0d:
case 0x85:
case 0x2028:
case 0x2029:
return op_code != OP_VSPACE;
default:
return op_code == OP_VSPACE;
}
default:
return FALSE;
}
/* Handle the case when the next item is \d, \s, etc. */
switch(op_code)
{
case OP_CHAR:
case OP_CHARNC:
#ifdef SUPPORT_UTF8
if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
#endif
switch(-next)
{
case ESC_d:
return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
case ESC_D:
return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
case ESC_s:
return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
case ESC_S:
return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
case ESC_w:
return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
case ESC_W:
return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
case ESC_h:
case ESC_H:
switch(item)
{
case 0x09:
case 0x20:
case 0xa0:
case 0x1680:
case 0x180e:
case 0x2000:
case 0x2001:
case 0x2002:
case 0x2003:
case 0x2004:
case 0x2005:
case 0x2006:
case 0x2007:
case 0x2008:
case 0x2009:
case 0x200A:
case 0x202f:
case 0x205f:
case 0x3000:
return -next != ESC_h;
default:
return -next == ESC_h;
}
case ESC_v:
case ESC_V:
switch(item)
{
case 0x0a:
case 0x0b:
case 0x0c:
case 0x0d:
case 0x85:
case 0x2028:
case 0x2029:
return -next != ESC_v;
default:
return -next == ESC_v;
}
default:
return FALSE;
}
case OP_DIGIT:
return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
next == -ESC_h || next == -ESC_v;
case OP_NOT_DIGIT:
return next == -ESC_d;
case OP_WHITESPACE:
return next == -ESC_S || next == -ESC_d || next == -ESC_w;
case OP_NOT_WHITESPACE:
return next == -ESC_s || next == -ESC_h || next == -ESC_v;
case OP_HSPACE:
return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
case OP_NOT_HSPACE:
return next == -ESC_h;
/* Can't have \S in here because VT matches \S (Perl anomaly) */
case OP_VSPACE:
return next == -ESC_V || next == -ESC_d || next == -ESC_w;
case OP_NOT_VSPACE:
return next == -ESC_v;
case OP_WORDCHAR:
return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
case OP_NOT_WORDCHAR:
return next == -ESC_w || next == -ESC_d;
default:
return FALSE;
}
/* Control does not reach here */
}
/*************************************************
* Compile one branch *
*************************************************/
| pcrecomp.c | 2066 |
STATIC BOOL | compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
static BOOL
compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
compile_data *cd, int *lengthptr)
{
int repeat_type, op_type;
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
int bravalue = 0;
int greedy_default, greedy_non_default;
int firstbyte, reqbyte;
int zeroreqbyte, zerofirstbyte;
int req_caseopt, reqvary, tempreqvary;
int options = *optionsptr;
int after_manual_callout = 0;
int length_prevgroup = 0;
register int c;
register uschar *code = *codeptr;
uschar *last_code = code;
uschar *orig_code = code;
uschar *tempcode;
BOOL inescq = FALSE;
BOOL groupsetfirstbyte = FALSE;
const uschar *ptr = *ptrptr;
const uschar *tempptr;
uschar *previous = NULL;
uschar *previous_callout = NULL;
uschar *save_hwm = NULL;
uschar classbits[32];
#ifdef SUPPORT_UTF8
BOOL class_utf8;
BOOL utf8 = (options & PCRE_UTF8) != 0;
uschar *class_utf8data;
uschar *class_utf8data_base;
uschar utf8_char[6];
#else
BOOL utf8 = FALSE;
uschar *utf8_char = NULL;
#endif
#ifdef DEBUG
if (lengthptr != NULL) DPRINTF((">> start branch\n"));
#endif
/* Set up the default and non-default settings for greediness */
greedy_default = ((options & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
/* Initialize no first byte, no required byte. REQ_UNSET means "no char
matching encountered yet". It gets changed to REQ_NONE if we hit something that
matches a non-fixed char first char; reqbyte just remains unset if we never
find one.
When we hit a repeat whose minimum is zero, we may have to adjust these values
to take the zero repeat into account. This is implemented by setting them to
zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
item types that can be repeated set these backoff variables appropriately. */
firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
according to the current setting of the caseless flag. REQ_CASELESS is a bit
value > 255. It is added into the firstbyte or reqbyte variables to record the
case status of the value. This is used only for ASCII characters. */
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
/* Switch on next character until the end of the branch */
for (;; ptr++)
{
BOOL negate_class;
BOOL should_flip_negation;
BOOL possessive_quantifier;
BOOL is_quantifier;
BOOL is_recurse;
BOOL reset_bracount;
int class_charcount;
int class_lastchar;
int newoptions;
int recno;
int refsign;
int skipbytes;
int subreqbyte;
int subfirstbyte;
int terminator;
int mclength;
uschar mcbuffer[8];
/* Get next byte in the pattern */
c = *ptr;
/* If we are in the pre-compile phase, accumulate the length used for the
previous cycle of this loop. */
if (lengthptr != NULL)
{
#ifdef DEBUG
if (code > cd->hwm) cd->hwm = code; /* High water info */
#endif
if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
{
*errorcodeptr = ERR52;
goto FAILED;
}
/* There is at least one situation where code goes backwards: this is the
case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
the class is simply eliminated. However, it is created first, so we have to
allow memory for it. Therefore, don't ever reduce the length at this point.
*/
if (code < last_code) code = last_code;
/* Paranoid check for integer overflow */
if (OFLOW_MAX - *lengthptr < code - last_code)
{
*errorcodeptr = ERR20;
goto FAILED;
}
*lengthptr += code - last_code;
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
/* If "previous" is set and it is not at the start of the work space, move
it back to there, in order to avoid filling up the work space. Otherwise,
if "previous" is NULL, reset the current code pointer to the start. */
if (previous != NULL)
{
if (previous > orig_code)
{
memmove(orig_code, previous, code - previous);
code -= previous - orig_code;
previous = orig_code;
}
}
else code = orig_code;
/* Remember where this code item starts so we can pick up the length
next time round. */
last_code = code;
}
/* In the real compile phase, just check the workspace used by the forward
reference list. */
else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
{
*errorcodeptr = ERR52;
goto FAILED;
}
/* If in \Q...\E, check for the end; if not, we have a literal */
if (inescq && c != 0)
{
if (c == '\\' && ptr[1] == 'E')
{
inescq = FALSE;
ptr++;
continue;
}
else
{
if (previous_callout != NULL)
{
if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
complete_callout(previous_callout, ptr, cd);
previous_callout = NULL;
}
if ((options & PCRE_AUTO_CALLOUT) != 0)
{
previous_callout = code;
code = auto_callout(code, ptr, cd);
}
goto NORMAL_CHAR;
}
}
/* Fill in length of a previous callout, except when the next thing is
a quantifier. */
is_quantifier = c == '*' || c == '+' || c == '?' ||
(c == '{' && is_counted_repeat(ptr+1));
if (!is_quantifier && previous_callout != NULL &&
after_manual_callout-- <= 0)
{
if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
complete_callout(previous_callout, ptr, cd);
previous_callout = NULL;
}
/* In extended mode, skip white space and comments */
if ((options & PCRE_EXTENDED) != 0)
{
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
while (*(++ptr) != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
}
if (*ptr != 0) continue;
/* Else fall through to handle end of string */
c = 0;
}
}
/* No auto callout for quantifiers. */
if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
{
previous_callout = code;
code = auto_callout(code, ptr, cd);
}
switch(c)
{
/* ===================================================================*/
case 0: /* The branch terminates at string end */
case '|': /* or | or ) */
case ')':
*firstbyteptr = firstbyte;
*reqbyteptr = reqbyte;
*codeptr = code;
*ptrptr = ptr;
if (lengthptr != NULL)
{
if (OFLOW_MAX - *lengthptr < code - last_code)
{
*errorcodeptr = ERR20;
goto FAILED;
}
*lengthptr += code - last_code; /* To include callout length */
DPRINTF((">> end branch\n"));
}
return TRUE;
/* ===================================================================*/
/* Handle single-character metacharacters. In multiline mode, ^ disables
the setting of any following char as a first character. */
case '^':
if ((options & PCRE_MULTILINE) != 0)
{
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
}
previous = NULL;
*code++ = OP_CIRC;
break;
case '$':
previous = NULL;
*code++ = OP_DOLL;
break;
/* There can never be a first char if '.' is first, whatever happens about
repeats. The value of reqbyte doesn't change either. */
case '.':
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
previous = code;
*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
break;
/* ===================================================================*/
/* Character classes. If the included characters are all < 256, we build a
32-byte bitmap of the permitted characters, except in the special case
where there is only one such character. For negated classes, we build the
map as usual, then invert it at the end. However, we use a different opcode
so that data characters > 255 can be handled correctly.
If the class contains characters outside the 0-255 range, a different
opcode is compiled. It may optionally have a bit map for characters < 256,
but those above are explicitly listed afterwards. A flag byte tells
whether the bitmap is present, and whether this is a negated class or not.
In JavaScript compatibility mode, an isolated ']' causes an error. In
default (Perl) mode, it is treated as a data character. */
case ']':
if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
{
*errorcodeptr = ERR64;
goto FAILED;
}
goto NORMAL_CHAR;
case '[':
previous = code;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
they are encountered at the top level, so we'll do that too. */
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
check_posix_syntax(ptr, &tempptr))
{
*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
goto FAILED;
}
/* If the first character is '^', set the negation flag and skip it. Also,
if the first few characters (either before or after ^) are \Q\E or \E we
skip them too. This makes for compatibility with Perl. */
negate_class = FALSE;
for (;;)
{
c = *(++ptr);
if (c == '\\')
{
if (ptr[1] == 'E') ptr++;
else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
else break;
}
else if (!negate_class && c == '^')
negate_class = TRUE;
else break;
}
/* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
an initial ']' is taken as a data character -- the code below handles
that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
[^] must match any character, so generate OP_ALLANY. */
if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
{
*code++ = negate_class? OP_ALLANY : OP_FAIL;
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
break;
}
/* If a class contains a negative special such as \S, we need to flip the
negation flag at the end, so that support for characters > 255 works
correctly (they are all included in the class). */
should_flip_negation = FALSE;
/* Keep a count of chars with values < 256 so that we can optimize the case
of just a single character (as long as it's < 256). However, for higher
valued UTF-8 characters, we don't yet do any optimization. */
class_charcount = 0;
class_lastchar = -1;
/* Initialize the 32-char bit map to all zeros. We build the map in a
temporary bit of memory, in case the class contains only 1 character (less
than 256), because in that case the compiled code doesn't use the bit map.
*/
memset(classbits, 0, 32 * sizeof(uschar));
#ifdef SUPPORT_UTF8
class_utf8 = FALSE; /* No chars >= 256 */
class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
#endif
/* Process characters until ] is reached. By writing this as a "do" it
means that an initial ] is taken as a data character. At the start of the
loop, c contains the first byte of the character. */
if (c != 0) do
{
const uschar *oldptr;
#ifdef SUPPORT_UTF8
if (utf8 && c > 127)
{ /* Braces are required because the */
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
}
/* In the pre-compile phase, accumulate the length of any UTF-8 extra
data and reset the pointer. This is so that very large classes that
contain a zillion UTF-8 characters no longer overwrite the work space
(which is on the stack). */
if (lengthptr != NULL)
{
*lengthptr += class_utf8data - class_utf8data_base;
class_utf8data = class_utf8data_base;
}
#endif
/* Inside \Q...\E everything is literal except \E */
if (inescq)
{
if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
{
inescq = FALSE; /* Reset literal state */
ptr++; /* Skip the 'E' */
continue; /* Carry on with next */
}
goto CHECK_RANGE; /* Could be range if \E follows */
}
/* Handle POSIX class names. Perl allows a negation extension of the
form [:^name:]. A square bracket that doesn't match the syntax is
treated as a literal. We also recognize the POSIX constructions
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5.6 and 5.8 do. */
if (c == '[' &&
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
check_posix_syntax(ptr, &tempptr))
{
BOOL local_negate = FALSE;
int posix_class, taboffset, tabopt;
register const uschar *cbits = cd->cbits;
uschar pbits[32];
if (ptr[1] != ':')
{
*errorcodeptr = ERR31;
goto FAILED;
}
ptr += 2;
if (*ptr == '^')
{
local_negate = TRUE;
should_flip_negation = TRUE; /* Note negative special */
ptr++;
}
posix_class = check_posix_name(ptr, tempptr - ptr);
if (posix_class < 0)
{
*errorcodeptr = ERR30;
goto FAILED;
}
/* If matching is caseless, upper and lower are converted to
alpha. This relies on the fact that the class table starts with
alpha, lower, upper as the first 3 entries. */
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
/* We build the bit map for the POSIX class in a chunk of local store
because we may be adding and subtracting from it, and we don't want to
subtract bits that may be in the main map already. At the end we or the
result into the bit map that is being built. */
posix_class *= 3;
/* Copy in the first table (always present) */
memcpy(pbits, cbits + posix_class_maps[posix_class],
32 * sizeof(uschar));
/* If there is a second table, add or remove it as required. */
taboffset = posix_class_maps[posix_class + 1];
tabopt = posix_class_maps[posix_class + 2];
if (taboffset >= 0)
{
if (tabopt >= 0)
for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
else
for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
}
/* Now see if we need to remove any special characters. An option
value of 1 removes vertical space and 2 removes underscore. */
if (tabopt < 0) tabopt = -tabopt;
if (tabopt == 1) pbits[1] &= ~0x3c;
else if (tabopt == 2) pbits[11] &= 0x7f;
/* Add the POSIX table or its complement into the main table that is
being built and we are done. */
if (local_negate)
for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
else
for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
ptr = tempptr + 1;
class_charcount = 10; /* Set > 1; assumes more than 1 per class */
continue; /* End of POSIX syntax handling */
}
/* Backslash may introduce a single character, or it may introduce one
of the specials, which just set a flag. The sequence \b is a special
case. Inside a class (and only there) it is treated as backspace.
Elsewhere it marks a word boundary. Other escapes have preset maps ready
to 'or' into the one we are building. We assume they have more than one
character in them, so set class_charcount bigger than one. */
if (c == '\\')
{
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
else if (-c == ESC_Q) /* Handle start of quoted string */
{
if (ptr[1] == '\\' && ptr[2] == 'E')
{
ptr += 2; /* avoid empty string */
}
else inescq = TRUE;
continue;
}
else if (-c == ESC_E) continue; /* Ignore orphan \E */
if (c < 0)
{
register const uschar *cbits = cd->cbits;
class_charcount += 2; /* Greater than 1 is what matters */
/* Save time by not doing this in the pre-compile phase. */
if (lengthptr == NULL) switch (-c)
{
case ESC_d:
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
continue;
case ESC_D:
should_flip_negation = TRUE;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
continue;
case ESC_w:
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
continue;
case ESC_W:
should_flip_negation = TRUE;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
continue;
case ESC_s:
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
case ESC_S:
should_flip_negation = TRUE;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
default: /* Not recognized; fall through */
break; /* Need "default" setting to stop compiler warning. */
}
/* In the pre-compile phase, just do the recognition. */
else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
/* We need to deal with \H, \h, \V, and \v in both phases because
they use extra memory. */
if (-c == ESC_h)
{
SETBIT(classbits, 0x09); /* VT */
SETBIT(classbits, 0x20); /* SPACE */
SETBIT(classbits, 0xa0); /* NSBP */
#ifdef SUPPORT_UTF8
if (utf8)
{
class_utf8 = TRUE;
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
}
#endif
continue;
}
if (-c == ESC_H)
{
for (c = 0; c < 32; c++)
{
int x = 0xff;
switch (c)
{
case 0x09/8: x ^= 1 << (0x09%8); break;
case 0x20/8: x ^= 1 << (0x20%8); break;
case 0xa0/8: x ^= 1 << (0xa0%8); break;
default: break;
}
classbits[c] |= x;
}
#ifdef SUPPORT_UTF8
if (utf8)
{
class_utf8 = TRUE;
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
}
#endif
continue;
}
if (-c == ESC_v)
{
SETBIT(classbits, 0x0a); /* LF */
SETBIT(classbits, 0x0b); /* VT */
SETBIT(classbits, 0x0c); /* FF */
SETBIT(classbits, 0x0d); /* CR */
SETBIT(classbits, 0x85); /* NEL */
#ifdef SUPPORT_UTF8
if (utf8)
{
class_utf8 = TRUE;
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
}
#endif
continue;
}
if (-c == ESC_V)
{
for (c = 0; c < 32; c++)
{
int x = 0xff;
switch (c)
{
case 0x0a/8: x ^= 1 << (0x0a%8);
x ^= 1 << (0x0b%8);
x ^= 1 << (0x0c%8);
x ^= 1 << (0x0d%8);
break;
case 0x85/8: x ^= 1 << (0x85%8); break;
default: break;
}
classbits[c] |= x;
}
#ifdef SUPPORT_UTF8
if (utf8)
{
class_utf8 = TRUE;
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
}
#endif
continue;
}
/* We need to deal with \P and \p in both phases. */
#ifdef SUPPORT_UCP
if (-c == ESC_p || -c == ESC_P)
{
BOOL negated;
int pdata;
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
if (ptype < 0) goto FAILED;
class_utf8 = TRUE;
*class_utf8data++ = ((-c == ESC_p) != negated)?
XCL_PROP : XCL_NOTPROP;
*class_utf8data++ = ptype;
*class_utf8data++ = pdata;
class_charcount -= 2; /* Not a < 256 character */
continue;
}
#endif
/* Unrecognized escapes are faulted if PCRE is running in its
strict mode. By default, for compatibility with Perl, they are
treated as literals. */
if ((options & PCRE_EXTRA) != 0)
{
*errorcodeptr = ERR7;
goto FAILED;
}
class_charcount -= 2; /* Undo the default count from above */
c = *ptr; /* Get the final character and fall through */
}
/* Fall through if we have a single character (c >= 0). This may be
greater than 256 in UTF-8 mode. */
} /* End of backslash handling */
/* A single character may be followed by '-' to form a range. However,
Perl does not permit ']' to be the end of the range. A '-' character
at the end is treated as a literal. Perl ignores orphaned \E sequences
entirely. The code for handling \Q and \E is messy. */
CHECK_RANGE:
while (ptr[1] == '\\' && ptr[2] == 'E')
{
inescq = FALSE;
ptr += 2;
}
oldptr = ptr;
/* Remember \r or \n */
if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
/* Check for range */
if (!inescq && ptr[1] == '-')
{
int d;
ptr += 2;
while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
/* If we hit \Q (not followed by \E) at this point, go into escaped
mode. */
while (*ptr == '\\' && ptr[1] == 'Q')
{
ptr += 2;
if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
inescq = TRUE;
break;
}
if (*ptr == 0 || (!inescq && *ptr == ']'))
{
ptr = oldptr;
goto LONE_SINGLE_CHARACTER;
}
#ifdef SUPPORT_UTF8
if (utf8)
{ /* Braces are required because the */
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
}
else
#endif
d = *ptr; /* Not UTF-8 mode */
/* The second part of a range can be a single-character escape, but
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
in such circumstances. */
if (!inescq && d == '\\')
{
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
/* \b is backspace; \X is literal X; \R is literal R; any other
special means the '-' was literal */
if (d < 0)
{
if (d == -ESC_b) d = '\b';
else if (d == -ESC_X) d = 'X';
else if (d == -ESC_R) d = 'R'; else
{
ptr = oldptr;
goto LONE_SINGLE_CHARACTER; /* A few lines below */
}
}
}
/* Check that the two values are in the correct order. Optimize
one-character ranges */
if (d < c)
{
*errorcodeptr = ERR8;
goto FAILED;
}
if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
/* Remember \r or \n */
if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
matching, we have to use an XCLASS with extra data items. Caseless
matching for characters > 127 is available only if UCP support is
available. */
#ifdef SUPPORT_UTF8
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
{
class_utf8 = TRUE;
/* With UCP support, we can find the other case equivalents of
the relevant characters. There may be several ranges. Optimize how
they fit with the basic range. */
#ifdef SUPPORT_UCP
if ((options & PCRE_CASELESS) != 0)
{
unsigned int occ, ocd;
unsigned int cc = c;
unsigned int origd = d;
while (get_othercase_range(&cc, origd, &occ, &ocd))
{
if (occ >= (unsigned int)c &&
ocd <= (unsigned int)d)
continue; /* Skip embedded ranges */
if (occ < (unsigned int)c &&
ocd >= (unsigned int)c - 1) /* Extend the basic range */
{ /* if there is overlap, */
c = occ; /* noting that if occ < c */
continue; /* we can't have ocd > d */
} /* because a subrange is */
if (ocd > (unsigned int)d &&
occ <= (unsigned int)d + 1) /* always shorter than */
{ /* the basic range. */
d = ocd;
continue;
}
if (occ == ocd)
{
*class_utf8data++ = XCL_SINGLE;
}
else
{
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
}
class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
}
}
#endif /* SUPPORT_UCP */
/* Now record the original range, possibly modified for UCP caseless
overlapping ranges. */
*class_utf8data++ = XCL_RANGE;
class_utf8data += _pcre_ord2utf8(c, class_utf8data);
class_utf8data += _pcre_ord2utf8(d, class_utf8data);
/* With UCP support, we are done. Without UCP support, there is no
caseless matching for UTF-8 characters > 127; we can use the bit map
for the smaller ones. */
#ifdef SUPPORT_UCP
continue; /* With next character in the class */
#else
if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
/* Adjust upper limit and fall through to set up the map */
d = 127;
#endif /* SUPPORT_UCP */
}
#endif /* SUPPORT_UTF8 */
/* We use the bit map for all cases when not in UTF-8 mode; else
ranges that lie entirely within 0-127 when there is UCP support; else
for partial ranges without UCP support. */
class_charcount += d - c + 1;
class_lastchar = d;
/* We can save a bit of time by skipping this in the pre-compile. */
if (lengthptr == NULL) for (; c <= d; c++)
{
classbits[c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
int uc = cd->fcc[c]; /* flip case */
classbits[uc/8] |= (1 << (uc&7));
}
}
continue; /* Go get the next char in the class */
}
/* Handle a lone single character - we can get here for a normal
non-escape char, or after \ that introduces a single character or for an
apparent range that isn't. */
LONE_SINGLE_CHARACTER:
/* Handle a character that cannot go in the bit map */
#ifdef SUPPORT_UTF8
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
{
class_utf8 = TRUE;
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(c, class_utf8data);
#ifdef SUPPORT_UCP
if ((options & PCRE_CASELESS) != 0)
{
unsigned int othercase;
if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
{
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
}
}
#endif /* SUPPORT_UCP */
}
else
#endif /* SUPPORT_UTF8 */
/* Handle a single-byte character */
{
classbits[c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
c = cd->fcc[c]; /* flip case */
classbits[c/8] |= (1 << (c&7));
}
class_charcount++;
class_lastchar = c;
}
}
/* Loop until ']' reached. This "while" is the end of the "do" above. */
while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
if (c == 0) /* Missing terminating ']' */
{
*errorcodeptr = ERR6;
goto FAILED;
}
/* This code has been disabled because it would mean that \s counts as
an explicit \r or \n reference, and that's not really what is wanted. Now
we set the flag only if there is a literal "\r" or "\n" in the class. */
#if 0
/* Remember whether \r or \n are in this class */
if (negate_class)
{
if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
}
else
{
if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
}
#endif
/* If class_charcount is 1, we saw precisely one character whose value is
less than 256. As long as there were no characters >= 128 and there was no
use of \p or \P, in other words, no use of any XCLASS features, we can
optimize.
In UTF-8 mode, we can optimize the negative case only if there were no
characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
operate on single-bytes only. This is an historical hangover. Maybe one day
we can tidy these opcodes to handle multi-byte characters.
The optimization throws away the bit map. We turn the item into a
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
that OP_NOT does not support multibyte characters. In the positive case, it
can cause firstbyte to be set. Otherwise, there can be no first char if
this item is first, whatever repeat count may follow. In the case of
reqbyte, save the previous value for reinstating. */
#ifdef SUPPORT_UTF8
if (class_charcount == 1 && !class_utf8 &&
(!utf8 || !negate_class || class_lastchar < 128))
#else
if (class_charcount == 1)
#endif
{
zeroreqbyte = reqbyte;
/* The OP_NOT opcode works on one-byte characters only. */
if (negate_class)
{
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
*code++ = OP_NOT;
*code++ = class_lastchar;
break;
}
/* For a single, positive character, get the value into mcbuffer, and
then we can handle this with the normal one-character code. */
#ifdef SUPPORT_UTF8
if (utf8 && class_lastchar > 127)
mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
else
#endif
{
mcbuffer[0] = class_lastchar;
mclength = 1;
}
goto ONE_CHAR;
} /* End of 1-char optimization */
/* The general case - not the one-char optimization. If this is the first
thing in the branch, there can be no first char setting, whatever the
repeat count. Any reqbyte setting must remain unchanged after any kind of
repeat. */
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
/* If there are characters with values > 255, we have to compile an
extended class, with its own opcode, unless there was a negated special
such as \S in the class, because in that case all characters > 255 are in
the class, so any that were explicitly given as well can be ignored. If
(when there are explicit characters > 255 that must be listed) there are no
characters < 256, we can omit the bitmap in the actual compiled code. */
#ifdef SUPPORT_UTF8
if (class_utf8 && !should_flip_negation)
{
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
code += LINK_SIZE;
*code = negate_class? XCL_NOT : 0;
/* If the map is required, move up the extra data to make room for it;
otherwise just move the code pointer to the end of the extra data. */
if (class_charcount > 0)
{
*code++ |= XCL_MAP;
memmove(code + 32, code, class_utf8data - code);
memcpy(code, classbits, 32);
code = class_utf8data + 32;
}
else code = class_utf8data;
/* Now fill in the complete length of the item */
PUT(previous, 1, code - previous);
break; /* End of class handling */
}
#endif
/* If there are no characters > 255, set the opcode to OP_CLASS or
OP_NCLASS, depending on whether the whole class was negated and whether
there were negative specials such as \S in the class. Then copy the 32-byte
map into the code vector, negating it if necessary. */
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
if (negate_class)
{
if (lengthptr == NULL) /* Save time in the pre-compile phase */
for (c = 0; c < 32; c++) code[c] = ~classbits[c];
}
else
{
memcpy(code, classbits, 32);
}
code += 32;
break;
/* ===================================================================*/
/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
has been tested above. */
case '{':
if (!is_quantifier) goto NORMAL_CHAR;
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
if (*errorcodeptr != 0) goto FAILED;
goto REPEAT;
case '*':
repeat_min = 0;
repeat_max = -1;
goto REPEAT;
case '+':
repeat_min = 1;
repeat_max = -1;
goto REPEAT;
case '?':
repeat_min = 0;
repeat_max = 1;
REPEAT:
if (previous == NULL)
{
*errorcodeptr = ERR9;
goto FAILED;
}
if (repeat_min == 0)
{
firstbyte = zerofirstbyte; /* Adjust for zero repeat */
reqbyte = zeroreqbyte; /* Ditto */
}
/* Remember whether this is a variable length repeat */
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
op_type = 0; /* Default single-char op codes */
possessive_quantifier = FALSE; /* Default not possessive quantifier */
/* Save start of previous item, in case we have to move it up to make space
for an inserted OP_ONCE for the additional '+' extension. */
tempcode = previous;
/* If the next character is '+', we have a possessive quantifier. This
implies greediness, whatever the setting of the PCRE_UNGREEDY option.
If the next character is '?' this is a minimizing repeat, by default,
but if PCRE_UNGREEDY is set, it works the other way round. We change the
repeat type to the non-default. */
if (ptr[1] == '+')
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
ptr++;
}
else if (ptr[1] == '?')
{
repeat_type = greedy_non_default;
ptr++;
}
else repeat_type = greedy_default;
/* If previous was a character match, abolish the item and generate a
repeat item instead. If a char item has a minimum of more than one, ensure
that it is set in reqbyte - it might not be if a sequence such as x{3} is
the first thing in a branch because the x will have gone into firstbyte
instead. */
if (*previous == OP_CHAR || *previous == OP_CHARNC)
{
/* Deal with UTF-8 characters that take up more than one byte. It's
easier to write this out separately than try to macrify it. Use c to
hold the length of the character in bytes, plus 0x80 to flag that it's a
length rather than a small character. */
#ifdef SUPPORT_UTF8
if (utf8 && (code[-1] & 0x80) != 0)
{
uschar *lastchar = code - 1;
while((*lastchar & 0xc0) == 0x80) lastchar--;
c = code - lastchar; /* Length of UTF-8 character */
memcpy(utf8_char, lastchar, c); /* Save the char */
c |= 0x80; /* Flag c as a length */
}
else
#endif
/* Handle the case of a single byte - either with no UTF8 support, or
with UTF-8 disabled, or for a UTF-8 character < 128. */
{
c = code[-1];
if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
}
/* If the repetition is unlimited, it pays to see if the next thing on
the line is something that cannot possibly match this character. If so,
automatically possessifying this item gains some performance in the case
where the match fails. */
if (!possessive_quantifier &&
repeat_max < 0 &&
check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
}
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
}
/* If previous was a single negated character ([^a] or similar), we use
one of the special opcodes, replacing it. The code is shared with single-
character repeats by setting op_type to add a suitable offset into
repeat_type. We can also test for auto-possessification. OP_NOT is
currently used only for single-byte chars. */
else if (*previous == OP_NOT)
{
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
c = previous[1];
if (!possessive_quantifier &&
repeat_max < 0 &&
check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
}
goto OUTPUT_SINGLE_REPEAT;
}
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
repeats by setting op_type to add a suitable offset into repeat_type. Note
that the Unicode property types will be present only when SUPPORT_UCP is
defined, but we don't wrap the little bits of code here because it just
makes it horribly messy. */
else if (*previous < OP_EODN)
{
uschar *oldcode;
int prop_type, prop_value;
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
c = *previous;
if (!possessive_quantifier &&
repeat_max < 0 &&
check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
}
OUTPUT_SINGLE_REPEAT:
if (*previous == OP_PROP || *previous == OP_NOTPROP)
{
prop_type = previous[1];
prop_value = previous[2];
}
else prop_type = prop_value = -1;
oldcode = code;
code = previous; /* Usually overwrite previous item */
/* If the maximum is zero then the minimum must also be zero; Perl allows
this case, so we do too - by simply omitting the item altogether. */
if (repeat_max == 0) goto END_REPEAT;
/* All real repeats make it impossible to handle partial matching (maybe
one day we will be able to remove this restriction). */
if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
/* Combine the op_type with the repeat_type */
repeat_type += op_type;
/* A minimum of zero is handled either as the special case * or ?, or as
an UPTO, with the maximum given. */
if (repeat_min == 0)
{
if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
else
{
*code++ = OP_UPTO + repeat_type;
PUT2INC(code, 0, repeat_max);
}
}
/* A repeat minimum of 1 is optimized into some special cases. If the
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
left in place and, if the maximum is greater than 1, we use OP_UPTO with
one less than the maximum. */
else if (repeat_min == 1)
{
if (repeat_max == -1)
*code++ = OP_PLUS + repeat_type;
else
{
code = oldcode; /* leave previous item in place */
if (repeat_max == 1) goto END_REPEAT;
*code++ = OP_UPTO + repeat_type;
PUT2INC(code, 0, repeat_max - 1);
}
}
/* The case {n,n} is just an EXACT, while the general case {n,m} is
handled as an EXACT followed by an UPTO. */
else
{
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
PUT2INC(code, 0, repeat_min);
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
we have to insert the character for the previous code. For a repeated
Unicode property match, there are two extra bytes that define the
required property. In UTF-8 mode, long characters have their length in
c, with the 0x80 bit as a flag. */
if (repeat_max < 0)
{
#ifdef SUPPORT_UTF8
if (utf8 && c >= 128)
{
memcpy(code, utf8_char, c & 7);
code += c & 7;
}
else
#endif
{
*code++ = c;
if (prop_type >= 0)
{
*code++ = prop_type;
*code++ = prop_value;
}
}
*code++ = OP_STAR + repeat_type;
}
/* Else insert an UPTO if the max is greater than the min, again
preceded by the character, for the previously inserted code. If the
UPTO is just for 1 instance, we can use QUERY instead. */
else if (repeat_max != repeat_min)
{
#ifdef SUPPORT_UTF8
if (utf8 && c >= 128)
{
memcpy(code, utf8_char, c & 7);
code += c & 7;
}
else
#endif
*code++ = c;
if (prop_type >= 0)
{
*code++ = prop_type;
*code++ = prop_value;
}
repeat_max -= repeat_min;
if (repeat_max == 1)
{
*code++ = OP_QUERY + repeat_type;
}
else
{
*code++ = OP_UPTO + repeat_type;
PUT2INC(code, 0, repeat_max);
}
}
}
/* The character or character type itself comes last in all cases. */
#ifdef SUPPORT_UTF8
if (utf8 && c >= 128)
{
memcpy(code, utf8_char, c & 7);
code += c & 7;
}
else
#endif
*code++ = c;
/* For a repeated Unicode property match, there are two extra bytes that
define the required property. */
#ifdef SUPPORT_UCP
if (prop_type >= 0)
{
*code++ = prop_type;
*code++ = prop_value;
}
#endif
}
/* If previous was a character class or a back reference, we put the repeat
stuff after it, but just skip the item if the repeat was {0,0}. */
else if (*previous == OP_CLASS ||
*previous == OP_NCLASS ||
#ifdef SUPPORT_UTF8
*previous == OP_XCLASS ||
#endif
*previous == OP_REF)
{
if (repeat_max == 0)
{
code = previous;
goto END_REPEAT;
}
/* All real repeats make it impossible to handle partial matching (maybe
one day we will be able to remove this restriction). */
if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
else if (repeat_min == 1 && repeat_max == -1)
*code++ = OP_CRPLUS + repeat_type;
else if (repeat_min == 0 && repeat_max == 1)
*code++ = OP_CRQUERY + repeat_type;
else
{
*code++ = OP_CRRANGE + repeat_type;
PUT2INC(code, 0, repeat_min);
if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
PUT2INC(code, 0, repeat_max);
}
}
/* If previous was a bracket group, we may have to replicate it in certain
cases. */
else if (*previous == OP_BRA || *previous == OP_CBRA ||
*previous == OP_ONCE || *previous == OP_COND)
{
register int i;
int ketoffset = 0;
int len = code - previous;
uschar *bralink = NULL;
/* Repeating a DEFINE group is pointless */
if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
{
*errorcodeptr = ERR55;
goto FAILED;
}
/* If the maximum repeat count is unlimited, find the end of the bracket
by scanning through from the start, and compute the offset back to it
from the current code pointer. There may be an OP_OPT setting following
the final KET, so we can't find the end just by going back from the code
pointer. */
if (repeat_max == -1)
{
register uschar *ket = previous;
do ket += GET(ket, 1); while (*ket != OP_KET);
ketoffset = code - ket;
}
/* The case of a zero minimum is special because of the need to stick
OP_BRAZERO in front of it, and because the group appears once in the
data, whereas in other cases it appears the minimum number of times. For
this reason, it is simplest to treat this case separately, as otherwise
the code gets far too messy. There are several special subcases when the
minimum is zero. */
if (repeat_min == 0)
{
/* If the maximum is also zero, we used to just omit the group from the
output altogether, like this:
** if (repeat_max == 0)
** {
** code = previous;
** goto END_REPEAT;
** }
However, that fails when a group is referenced as a subroutine from
elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
so that it is skipped on execution. As we don't have a list of which
groups are referenced, we cannot do this selectively.
If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
and do no more at this point. However, we do need to adjust any
OP_RECURSE calls inside the group that refer to the group itself or any
internal or forward referenced group, because the offset is from the
start of the whole regex. Temporarily terminate the pattern while doing
this. */
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
{
*code = OP_END;
adjust_recurse(previous, 1, utf8, cd, save_hwm);
memmove(previous+1, previous, len);
code++;
if (repeat_max == 0)
{
*previous++ = OP_SKIPZERO;
goto END_REPEAT;
}
*previous++ = OP_BRAZERO + repeat_type;
}
/* If the maximum is greater than 1 and limited, we have to replicate
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
The first one has to be handled carefully because it's the original
copy, which has to be moved up. The remainder can be handled by code
that is common with the non-zero minimum case below. We have to
adjust the value of repeat_max, since one less copy is required. Once
again, we may have to adjust any OP_RECURSE calls inside the group. */
else
{
int offset;
*code = OP_END;
adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
memmove(previous + 2 + LINK_SIZE, previous, len);
code += 2 + LINK_SIZE;
*previous++ = OP_BRAZERO + repeat_type;
*previous++ = OP_BRA;
/* We chain together the bracket offset fields that have to be
filled in later when the ends of the brackets are reached. */
offset = (bralink == NULL)? 0 : previous - bralink;
bralink = previous;
PUTINC(previous, 0, offset);
}
repeat_max--;
}
/* If the minimum is greater than zero, replicate the group as many
times as necessary, and adjust the maximum to the number of subsequent
copies that we need. If we set a first char from the group, and didn't
set a required char, copy the latter from the former. If there are any
forward reference subroutine calls in the group, there will be entries on
the workspace list; replicate these with an appropriate increment. */
else
{
if (repeat_min > 1)
{
/* In the pre-compile phase, we don't actually do the replication. We
just adjust the length as if we had. Do some paranoid checks for
potential integer overflow. */
if (lengthptr != NULL)
{
int delta = (repeat_min - 1)*length_prevgroup;
if ((double)(repeat_min - 1)*(double)length_prevgroup >
(double)INT_MAX ||
OFLOW_MAX - *lengthptr < delta)
{
*errorcodeptr = ERR20;
goto FAILED;
}
*lengthptr += delta;
}
/* This is compiling for real */
else
{
if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
for (i = 1; i < repeat_min; i++)
{
uschar *hc;
uschar *this_hwm = cd->hwm;
memcpy(code, previous, len);
for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
{
PUT(cd->hwm, 0, GET(hc, 0) + len);
cd->hwm += LINK_SIZE;
}
save_hwm = this_hwm;
code += len;
}
}
}
if (repeat_max > 0) repeat_max -= repeat_min;
}
/* This code is common to both the zero and non-zero minimum cases. If
the maximum is limited, it replicates the group in a nested fashion,
remembering the bracket starts on a stack. In the case of a zero minimum,
the first one was set up above. In all cases the repeat_max now specifies
the number of additional copies needed. Again, we must remember to
replicate entries on the forward reference list. */
if (repeat_max >= 0)
{
/* In the pre-compile phase, we don't actually do the replication. We
just adjust the length as if we had. For each repetition we must add 1
to the length for BRAZERO and for all but the last repetition we must
add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
paranoid checks to avoid integer overflow. */
if (lengthptr != NULL && repeat_max > 0)
{
int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
2 - 2*LINK_SIZE; /* Last one doesn't nest */
if ((double)repeat_max *
(double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
> (double)INT_MAX ||
OFLOW_MAX - *lengthptr < delta)
{
*errorcodeptr = ERR20;
goto FAILED;
}
*lengthptr += delta;
}
/* This is compiling for real */
else for (i = repeat_max - 1; i >= 0; i--)
{
uschar *hc;
uschar *this_hwm = cd->hwm;
*code++ = OP_BRAZERO + repeat_type;
/* All but the final copy start a new nesting, maintaining the
chain of brackets outstanding. */
if (i != 0)
{
int offset;
*code++ = OP_BRA;
offset = (bralink == NULL)? 0 : code - bralink;
bralink = code;
PUTINC(code, 0, offset);
}
memcpy(code, previous, len);
for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
{
PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
cd->hwm += LINK_SIZE;
}
save_hwm = this_hwm;
code += len;
}
/* Now chain through the pending brackets, and fill in their length
fields (which are holding the chain links pro tem). */
while (bralink != NULL)
{
int oldlinkoffset;
int offset = code - bralink + 1;
uschar *bra = code - offset;
oldlinkoffset = GET(bra, 1);
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
*code++ = OP_KET;
PUTINC(code, 0, offset);
PUT(bra, 1, offset);
}
}
/* If the maximum is unlimited, set a repeater in the final copy. We
can't just offset backwards from the current code point, because we
don't know if there's been an options resetting after the ket. The
correct offset was computed above.
Then, when we are doing the actual compile phase, check to see whether
this group is a non-atomic one that could match an empty string. If so,
convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
that runtime checking can be done. [This check is also applied to
atomic groups at runtime, but in a different way.] */
else
{
uschar *ketcode = code - ketoffset;
uschar *bracode = ketcode - GET(ketcode, 1);
*ketcode = OP_KETRMAX + repeat_type;
if (lengthptr == NULL && *bracode != OP_ONCE)
{
uschar *scode = bracode;
do
{
if (could_be_empty_branch(scode, ketcode, utf8))
{
*bracode += OP_SBRA - OP_BRA;
break;
}
scode += GET(scode, 1);
}
while (*scode == OP_ALT);
}
}
}
/* If previous is OP_FAIL, it was generated by an empty class [] in
JavaScript mode. The other ways in which OP_FAIL can be generated, that is
by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
error above. We can just ignore the repeat in JS case. */
else if (*previous == OP_FAIL) goto END_REPEAT;
/* Else there's some kind of shambles */
else
{
*errorcodeptr = ERR11;
goto FAILED;
}
/* If the character following a repeat is '+', or if certain optimization
tests above succeeded, possessive_quantifier is TRUE. For some of the
simpler opcodes, there is a special alternative opcode for this. For
anything else, we wrap the entire repeated item inside OP_ONCE brackets.
The '+' notation is just syntactic sugar, taken from Sun's Java package,
but the special opcodes can optimize it a bit. The repeated item starts at
tempcode, not at previous, which might be the first part of a string whose
(former) last char we repeated.
Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
an 'upto' may follow. We skip over an 'exact' item, and then test the
length of what remains before proceeding. */
if (possessive_quantifier)
{
int len;
if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
*tempcode == OP_NOTEXACT)
tempcode += _pcre_OP_lengths[*tempcode] +
((*tempcode == OP_TYPEEXACT &&
(tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
len = code - tempcode;
if (len > 0) switch (*tempcode)
{
case OP_STAR: *tempcode = OP_POSSTAR; break;
case OP_PLUS: *tempcode = OP_POSPLUS; break;
case OP_QUERY: *tempcode = OP_POSQUERY; break;
case OP_UPTO: *tempcode = OP_POSUPTO; break;
case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
default:
memmove(tempcode + 1+LINK_SIZE, tempcode, len);
code += 1 + LINK_SIZE;
len += 1 + LINK_SIZE;
tempcode[0] = OP_ONCE;
*code++ = OP_KET;
PUTINC(code, 0, len);
PUT(tempcode, 1, len);
break;
}
}
/* In all cases we no longer have a previous item. We also set the
"follows varying string" flag for subsequently encountered reqbytes if
it isn't already set and we have just passed a varying length item. */
END_REPEAT:
previous = NULL;
cd->req_varyopt |= reqvary;
break;
/* ===================================================================*/
/* Start of nested parenthesized sub-expression, or comment or lookahead or
lookbehind or option setting or condition or all the other extended
parenthesis forms. */
case '(':
newoptions = options;
skipbytes = 0;
bravalue = OP_CBRA;
save_hwm = cd->hwm;
reset_bracount = FALSE;
/* First deal with various "verbs" that can be introduced by '*'. */
if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
{
int i, namelen;
const char *vn = verbnames;
const uschar *name = ++ptr;
previous = NULL;
while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
if (*ptr == ':')
{
*errorcodeptr = ERR59; /* Not supported */
goto FAILED;
}
if (*ptr != ')')
{
*errorcodeptr = ERR60;
goto FAILED;
}
namelen = ptr - name;
for (i = 0; i < verbcount; i++)
{
if (namelen == verbs[i].len &&
strncmp((char *)name, vn, namelen) == 0)
{
*code = verbs[i].op;
if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
break;
}
vn += verbs[i].len + 1;
}
if (i < verbcount) continue;
*errorcodeptr = ERR60;
goto FAILED;
}
/* Deal with the extended parentheses; all are introduced by '?', and the
appearance of any of them means that this is not a capturing group. */
else if (*ptr == '?')
{
int i, set, unset, namelen;
int *optset;
const uschar *name;
uschar *slot;
switch (*(++ptr))
{
case '#': /* Comment; skip to ket */
ptr++;
while (*ptr != 0 && *ptr != ')') ptr++;
if (*ptr == 0)
{
*errorcodeptr = ERR18;
goto FAILED;
}
continue;
/* ------------------------------------------------------------ */
case '|': /* Reset capture count for each branch */
reset_bracount = TRUE;
/* Fall through */
/* ------------------------------------------------------------ */
case ':': /* Non-capturing bracket */
bravalue = OP_BRA;
ptr++;
break;
/* ------------------------------------------------------------ */
case '(':
bravalue = OP_COND; /* Conditional group */
/* A condition can be an assertion, a number (referring to a numbered
group), a name (referring to a named group), or 'R', referring to
recursion. R and R&name are also permitted for recursion tests.
There are several syntaxes for testing a named group: (?(name)) is used
by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
There are two unfortunate ambiguities, caused by history. (a) 'R' can
be the recursive thing or the name 'R' (and similarly for 'R' followed
by digits), and (b) a number could be a name that consists of digits.
In both cases, we look for a name first; if not found, we try the other
cases. */
/* For conditions that are assertions, check the syntax, and then exit
the switch. This will take control down to where bracketed groups,
including assertions, are processed. */
if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
break;
/* Most other conditions use OP_CREF (a couple change to OP_RREF
below), and all need to skip 3 bytes at the start of the group. */
code[1+LINK_SIZE] = OP_CREF;
skipbytes = 3;
refsign = -1;
/* Check for a test for recursion in a named group. */
if (ptr[1] == 'R' && ptr[2] == '&')
{
terminator = -1;
ptr += 2;
code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
}
/* Check for a test for a named group's having been set, using the Perl
syntax (?(<name>) or (?('name') */
else if (ptr[1] == '<')
{
terminator = '>';
ptr++;
}
else if (ptr[1] == '\'')
{
terminator = '\'';
ptr++;
}
else
{
terminator = 0;
if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
}
/* We now expect to read a name; any thing else is an error */
if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
{
ptr += 1; /* To get the right offset */
*errorcodeptr = ERR28;
goto FAILED;
}
/* Read the name, but also get it as a number if it's all digits */
recno = 0;
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0)
{
if (recno >= 0)
recno = ((digitab[*ptr] & ctype_digit) != 0)?
recno * 10 + *ptr - '0' : -1;
ptr++;
}
namelen = ptr - name;
if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
{
ptr--; /* Error offset */
*errorcodeptr = ERR26;
goto FAILED;
}
/* Do no further checking in the pre-compile phase. */
if (lengthptr != NULL) break;
/* In the real compile we do the work of looking for the actual
reference. If the string started with "+" or "-" we require the rest to
be digits, in which case recno will be set. */
if (refsign > 0)
{
if (recno <= 0)
{
*errorcodeptr = ERR58;
goto FAILED;
}
recno = (refsign == '-')?
cd->bracount - recno + 1 : recno +cd->bracount;
if (recno <= 0 || recno > cd->final_bracount)
{
*errorcodeptr = ERR15;
goto FAILED;
}
PUT2(code, 2+LINK_SIZE, recno);
break;
}
/* Otherwise (did not start with "+" or "-"), start by looking for the
name. */
slot = cd->name_table;
for (i = 0; i < cd->names_found; i++)
{
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
slot += cd->name_entry_size;
}
/* Found a previous named subpattern */
if (i < cd->names_found)
{
recno = GET2(slot, 0);
PUT2(code, 2+LINK_SIZE, recno);
}
/* Search the pattern for a forward reference */
else if ((i = find_parens(ptr, cd, name, namelen,
(options & PCRE_EXTENDED) != 0)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
}
/* If terminator == 0 it means that the name followed directly after
the opening parenthesis [e.g. (?(abc)...] and in this case there are
some further alternatives to try. For the cases where terminator != 0
[things like (?(<name>)... or (?('name')... or (?(R&name)... ] we have
now checked all the possibilities, so give an error. */
else if (terminator != 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
/* Check for (?(R) for recursion. Allow digits after R to specify a
specific group number. */
else if (*name == 'R')
{
recno = 0;
for (i = 1; i < namelen; i++)
{
if ((digitab[name[i]] & ctype_digit) == 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
recno = recno * 10 + name[i] - '0';
}
if (recno == 0) recno = RREF_ANY;
code[1+LINK_SIZE] = OP_RREF; /* Change test type */
PUT2(code, 2+LINK_SIZE, recno);
}
/* Similarly, check for the (?(DEFINE) "condition", which is always
false. */
else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
{
code[1+LINK_SIZE] = OP_DEF;
skipbytes = 1;
}
/* Check for the "name" actually being a subpattern number. We are
in the second pass here, so final_bracount is set. */
else if (recno > 0 && recno <= cd->final_bracount)
{
PUT2(code, 2+LINK_SIZE, recno);
}
/* Either an unidentified subpattern, or a reference to (?(0) */
else
{
*errorcodeptr = (recno == 0)? ERR35: ERR15;
goto FAILED;
}
break;
/* ------------------------------------------------------------ */
case '=': /* Positive lookahead */
bravalue = OP_ASSERT;
ptr++;
break;
/* ------------------------------------------------------------ */
case '!': /* Negative lookahead */
ptr++;
if (*ptr == ')') /* Optimize (?!) */
{
*code++ = OP_FAIL;
previous = NULL;
continue;
}
bravalue = OP_ASSERT_NOT;
break;
/* ------------------------------------------------------------ */
case '<': /* Lookbehind or named define */
switch (ptr[1])
{
case '=': /* Positive lookbehind */
bravalue = OP_ASSERTBACK;
ptr += 2;
break;
case '!': /* Negative lookbehind */
bravalue = OP_ASSERTBACK_NOT;
ptr += 2;
break;
default: /* Could be name define, else bad */
if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
ptr++; /* Correct offset for error */
*errorcodeptr = ERR24;
goto FAILED;
}
break;
/* ------------------------------------------------------------ */
case '>': /* One-time brackets */
bravalue = OP_ONCE;
ptr++;
break;
/* ------------------------------------------------------------ */
case 'C': /* Callout - may be followed by digits; */
previous_callout = code; /* Save for later completion */
after_manual_callout = 1; /* Skip one item before completing */
*code++ = OP_CALLOUT;
{
int n = 0;
while ((digitab[*(++ptr)] & ctype_digit) != 0)
n = n * 10 + *ptr - '0';
if (*ptr != ')')
{
*errorcodeptr = ERR39;
goto FAILED;
}
if (n > 255)
{
*errorcodeptr = ERR38;
goto FAILED;
}
*code++ = n;
PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
PUT(code, LINK_SIZE, 0); /* Default length */
code += 2 * LINK_SIZE;
}
previous = NULL;
continue;
/* ------------------------------------------------------------ */
case 'P': /* Python-style named subpattern handling */
if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
{
is_recurse = *ptr == '>';
terminator = ')';
goto NAMED_REF_OR_RECURSE;
}
else if (*ptr != '<') /* Test for Python-style definition */
{
*errorcodeptr = ERR41;
goto FAILED;
}
/* Fall through to handle (?P< as (?< is handled */
/* ------------------------------------------------------------ */
DEFINE_NAME: /* Come here from (?< handling */
case '\'':
{
terminator = (*ptr == '<')? '>' : '\'';
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
namelen = ptr - name;
/* In the pre-compile phase, just do a syntax check. */
if (lengthptr != NULL)
{
if (*ptr != terminator)
{
*errorcodeptr = ERR42;
goto FAILED;
}
if (cd->names_found >= MAX_NAME_COUNT)
{
*errorcodeptr = ERR49;
goto FAILED;
}
if (namelen + 3 > cd->name_entry_size)
{
cd->name_entry_size = namelen + 3;
if (namelen > MAX_NAME_SIZE)
{
*errorcodeptr = ERR48;
goto FAILED;
}
}
}
/* In the real compile, create the entry in the table */
else
{
slot = cd->name_table;
for (i = 0; i < cd->names_found; i++)
{
int crc = memcmp(name, slot+2, namelen);
if (crc == 0)
{
if (slot[2+namelen] == 0)
{
if ((options & PCRE_DUPNAMES) == 0)
{
*errorcodeptr = ERR43;
goto FAILED;
}
}
else crc = -1; /* Current name is substring */
}
if (crc < 0)
{
memmove(slot + cd->name_entry_size, slot,
(cd->names_found - i) * cd->name_entry_size);
break;
}
slot += cd->name_entry_size;
}
PUT2(slot, 0, cd->bracount + 1);
memcpy(slot + 2, name, namelen);
slot[2+namelen] = 0;
}
}
/* In both cases, count the number of names we've encountered. */
ptr++; /* Move past > or ' */
cd->names_found++;
goto NUMBERED_GROUP;
/* ------------------------------------------------------------ */
case '&': /* Perl recursion/subroutine syntax */
terminator = ')';
is_recurse = TRUE;
/* Fall through */
/* We come here from the Python syntax above that handles both
references (?P=name) and recursion (?P>name), as well as falling
through from the Perl recursion syntax (?&name). We also come here from
the Perl \k<name> or \k'name' back reference syntax and the \k{name}
.NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
NAMED_REF_OR_RECURSE:
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
namelen = ptr - name;
/* In the pre-compile phase, do a syntax check and set a dummy
reference number. */
if (lengthptr != NULL)
{
if (namelen == 0)
{
*errorcodeptr = ERR62;
goto FAILED;
}
if (*ptr != terminator)
{
*errorcodeptr = ERR42;
goto FAILED;
}
if (namelen > MAX_NAME_SIZE)
{
*errorcodeptr = ERR48;
goto FAILED;
}
recno = 0;
}
/* In the real compile, seek the name in the table. We check the name
first, and then check that we have reached the end of the name in the
table. That way, if the name is longer than any in the table,
the comparison will fail without reading beyond the table entry. */
else
{
slot = cd->name_table;
for (i = 0; i < cd->names_found; i++)
{
if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
slot[2+namelen] == 0)
break;
slot += cd->name_entry_size;
}
if (i < cd->names_found) /* Back reference */
{
recno = GET2(slot, 0);
}
else if ((recno = /* Forward back reference */
find_parens(ptr, cd, name, namelen,
(options & PCRE_EXTENDED) != 0)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
}
/* In both phases, we can now go to the code that handles numerical
recursion or backreferences. */
if (is_recurse) goto HANDLE_RECURSION;
else goto HANDLE_REFERENCE;
/* ------------------------------------------------------------ */
case 'R': /* Recursion */
ptr++; /* Same as (?0) */
/* Fall through */
/* ------------------------------------------------------------ */
case '-': case '+':
case '0': case '1': case '2': case '3': case '4': /* Recursion or */
case '5': case '6': case '7': case '8': case '9': /* subroutine */
{
const uschar *called;
terminator = ')';
/* Come here from the \g<...> and \g'...' code (Oniguruma
compatibility). However, the syntax has been checked to ensure that
the ... are a (signed) number, so that neither ERR63 nor ERR29 will
be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
ever be taken. */
HANDLE_NUMERICAL_RECURSION:
if ((refsign = *ptr) == '+')
{
ptr++;
if ((digitab[*ptr] & ctype_digit) == 0)
{
*errorcodeptr = ERR63;
goto FAILED;
}
}
else if (refsign == '-')
{
if ((digitab[ptr[1]] & ctype_digit) == 0)
goto OTHER_CHAR_AFTER_QUERY;
ptr++;
}
recno = 0;
while((digitab[*ptr] & ctype_digit) != 0)
recno = recno * 10 + *ptr++ - '0';
if (*ptr != terminator)
{
*errorcodeptr = ERR29;
goto FAILED;
}
if (refsign == '-')
{
if (recno == 0)
{
*errorcodeptr = ERR58;
goto FAILED;
}
recno = cd->bracount - recno + 1;
if (recno <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
}
else if (refsign == '+')
{
if (recno == 0)
{
*errorcodeptr = ERR58;
goto FAILED;
}
recno += cd->bracount;
}
/* Come here from code above that handles a named recursion */
HANDLE_RECURSION:
previous = code;
called = cd->start_code;
/* When we are actually compiling, find the bracket that is being
referenced. Temporarily end the regex in case it doesn't exist before
this point. If we end up with a forward reference, first check that
the bracket does occur later so we can give the error (and position)
now. Then remember this forward reference in the workspace so it can
be filled in at the end. */
if (lengthptr == NULL)
{
*code = OP_END;
if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
/* Forward reference */
if (called == NULL)
{
if (find_parens(ptr, cd, NULL, recno,
(options & PCRE_EXTENDED) != 0) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
called = cd->start_code + recno;
PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
}
/* If not a forward reference, and the subpattern is still open,
this is a recursive call. We check to see if this is a left
recursion that could loop for ever, and diagnose that case. */
else if (GET(called, 1) == 0 &&
could_be_empty(called, code, bcptr, utf8))
{
*errorcodeptr = ERR40;
goto FAILED;
}
}
/* Insert the recursion/subroutine item, automatically wrapped inside
"once" brackets. Set up a "previous group" length so that a
subsequent quantifier will work. */
*code = OP_ONCE;
PUT(code, 1, 2 + 2*LINK_SIZE);
code += 1 + LINK_SIZE;
*code = OP_RECURSE;
PUT(code, 1, called - cd->start_code);
code += 1 + LINK_SIZE;
*code = OP_KET;
PUT(code, 1, 2 + 2*LINK_SIZE);
code += 1 + LINK_SIZE;
length_prevgroup = 3 + 3*LINK_SIZE;
}
/* Can't determine a first byte now */
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
continue;
/* ------------------------------------------------------------ */
default: /* Other characters: check option setting */
OTHER_CHAR_AFTER_QUERY:
set = unset = 0;
optset = &set;
while (*ptr != ')' && *ptr != ':')
{
switch (*ptr++)
{
case '-': optset = &unset; break;
case 'J': /* Record that it changed in the external options */
*optset |= PCRE_DUPNAMES;
cd->external_flags |= PCRE_JCHANGED;
break;
case 'i': *optset |= PCRE_CASELESS; break;
case 'm': *optset |= PCRE_MULTILINE; break;
case 's': *optset |= PCRE_DOTALL; break;
case 'x': *optset |= PCRE_EXTENDED; break;
case 'U': *optset |= PCRE_UNGREEDY; break;
case 'X': *optset |= PCRE_EXTRA; break;
default: *errorcodeptr = ERR12;
ptr--; /* Correct the offset */
goto FAILED;
}
}
/* Set up the changed option bits, but don't change anything yet. */
newoptions = (options | set) & (~unset);
/* If the options ended with ')' this is not the start of a nested
group with option changes, so the options change at this level. If this
item is right at the start of the pattern, the options can be
abstracted and made external in the pre-compile phase, and ignored in
the compile phase. This can be helpful when matching -- for instance in
caseless checking of required bytes.
If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
definitely *not* at the start of the pattern because something has been
compiled. In the pre-compile phase, however, the code pointer can have
that value after the start, because it gets reset as code is discarded
during the pre-compile. However, this can happen only at top level - if
we are within parentheses, the starting BRA will still be present. At
any parenthesis level, the length value can be used to test if anything
has been compiled at that level. Thus, a test for both these conditions
is necessary to ensure we correctly detect the start of the pattern in
both phases.
If we are not at the pattern start, compile code to change the ims
options if this setting actually changes any of them. We also pass the
new setting back so that it can be put at the start of any following
branches, and when this group ends (if we are in a group), a resetting
item can be compiled. */
if (*ptr == ')')
{
if (code == cd->start_code + 1 + LINK_SIZE &&
(lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
{
cd->external_options = newoptions;
options = newoptions;
}
else
{
if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
{
*code++ = OP_OPT;
*code++ = newoptions & PCRE_IMS;
}
/* Change options at this level, and pass them back for use
in subsequent branches. Reset the greedy defaults and the case
value for firstbyte and reqbyte. */
*optionsptr = options = newoptions;
greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
}
previous = NULL; /* This item can't be repeated */
continue; /* It is complete */
}
/* If the options ended with ':' we are heading into a nested group
with possible change of options. Such groups are non-capturing and are
not assertions of any kind. All we need to do is skip over the ':';
the newoptions value is handled below. */
bravalue = OP_BRA;
ptr++;
} /* End of switch for character following (? */
} /* End of (? handling */
/* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
all unadorned brackets become non-capturing and behave like (?:...)
brackets. */
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
{
bravalue = OP_BRA;
}
/* Else we have a capturing group. */
else
{
NUMBERED_GROUP:
cd->bracount += 1;
PUT2(code, 1+LINK_SIZE, cd->bracount);
skipbytes = 2;
}
/* Process nested bracketed regex. Assertions may not be repeated, but
other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
non-register variable in order to be able to pass its address because some
compilers complain otherwise. Pass in a new setting for the ims options if
they have changed. */
previous = (bravalue >= OP_ONCE)? code : NULL;
*code = bravalue;
tempcode = code;
tempreqvary = cd->req_varyopt; /* Save value before bracket */
length_prevgroup = 0; /* Initialize for pre-compile phase */
if (!compile_regex(
newoptions, /* The complete new option state */
options & PCRE_IMS, /* The previous ims option state */
&tempcode, /* Where to put code (updated) */
&ptr, /* Input pointer (updated) */
errorcodeptr, /* Where to put an error message */
(bravalue == OP_ASSERTBACK ||
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
reset_bracount, /* True if (?| group */
skipbytes, /* Skip over bracket number */
&subfirstbyte, /* For possible first char */
&subreqbyte, /* For possible last char */
bcptr, /* Current branch chain */
cd, /* Tables block */
(lengthptr == NULL)? NULL : /* Actual compile phase */
&length_prevgroup /* Pre-compile phase */
))
goto FAILED;
/* At the end of compiling, code is still pointing to the start of the
group, while tempcode has been updated to point past the end of the group
and any option resetting that may follow it. The pattern pointer (ptr)
is on the bracket. */
/* If this is a conditional bracket, check that there are no more than
two branches in the group, or just one if it's a DEFINE group. We do this
in the real compile phase, not in the pre-pass, where the whole group may
not be available. */
if (bravalue == OP_COND && lengthptr == NULL)
{
uschar *tc = code;
int condcount = 0;
do {
condcount++;
tc += GET(tc,1);
}
while (*tc != OP_KET);
/* A DEFINE group is never obeyed inline (the "condition" is always
false). It must have only one branch. */
if (code[LINK_SIZE+1] == OP_DEF)
{
if (condcount > 1)
{
*errorcodeptr = ERR54;
goto FAILED;
}
bravalue = OP_DEF; /* Just a flag to suppress char handling below */
}
/* A "normal" conditional group. If there is just one branch, we must not
make use of its firstbyte or reqbyte, because this is equivalent to an
empty second branch. */
else
{
if (condcount > 2)
{
*errorcodeptr = ERR27;
goto FAILED;
}
if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
}
}
/* Error if hit end of pattern */
if (*ptr != ')')
{
*errorcodeptr = ERR14;
goto FAILED;
}
/* In the pre-compile phase, update the length by the length of the group,
less the brackets at either end. Then reduce the compiled code to just a
set of non-capturing brackets so that it doesn't use much memory if it is
duplicated by a quantifier.*/
if (lengthptr != NULL)
{
if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
{
*errorcodeptr = ERR20;
goto FAILED;
}
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
*code++ = OP_BRA;
PUTINC(code, 0, 1 + LINK_SIZE);
*code++ = OP_KET;
PUTINC(code, 0, 1 + LINK_SIZE);
break; /* No need to waste time with special character handling */
}
/* Otherwise update the main code pointer to the end of the group. */
code = tempcode;
/* For a DEFINE group, required and first character settings are not
relevant. */
if (bravalue == OP_DEF) break;
/* Handle updating of the required and first characters for other types of
group. Update for normal brackets of all kinds, and conditions with two
branches (see code above). If the bracket is followed by a quantifier with
zero repeat, we have to back off. Hence the definition of zeroreqbyte and
zerofirstbyte outside the main loop so that they can be accessed for the
back off. */
zeroreqbyte = reqbyte;
zerofirstbyte = firstbyte;
groupsetfirstbyte = FALSE;
if (bravalue >= OP_ONCE)
{
/* If we have not yet set a firstbyte in this branch, take it from the
subpattern, remembering that it was set here so that a repeat of more
than one can replicate it as reqbyte if necessary. If the subpattern has
no firstbyte, set "none" for the whole branch. In both cases, a zero
repeat forces firstbyte to "none". */
if (firstbyte == REQ_UNSET)
{
if (subfirstbyte >= 0)
{
firstbyte = subfirstbyte;
groupsetfirstbyte = TRUE;
}
else firstbyte = REQ_NONE;
zerofirstbyte = REQ_NONE;
}
/* If firstbyte was previously set, convert the subpattern's firstbyte
into reqbyte if there wasn't one, using the vary flag that was in
existence beforehand. */
else if (subfirstbyte >= 0 && subreqbyte < 0)
subreqbyte = subfirstbyte | tempreqvary;
/* If the subpattern set a required byte (or set a first byte that isn't
really the first byte - see above), set it. */
if (subreqbyte >= 0) reqbyte = subreqbyte;
}
/* For a forward assertion, we take the reqbyte, if set. This can be
helpful if the pattern that follows the assertion doesn't set a different
char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
for an assertion, however because it leads to incorrect effect for patterns
such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
of a firstbyte. This is overcome by a scan at the end if there's no
firstbyte, looking for an asserted first char. */
else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
break; /* End of processing '(' */
/* ===================================================================*/
/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
are arranged to be the negation of the corresponding OP_values. For the
back references, the values are ESC_REF plus the reference number. Only
back references and those types that consume a character may be repeated.
We can test for values between ESC_b and ESC_Z for the latter; this may
have to change if any new ones are ever created. */
case '\\':
tempptr = ptr;
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
if (*errorcodeptr != 0) goto FAILED;
if (c < 0)
{
if (-c == ESC_Q) /* Handle start of quoted string */
{
if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
else inescq = TRUE;
continue;
}
if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
/* For metasequences that actually match a character, we disable the
setting of a first character if it hasn't already been set. */
if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
firstbyte = REQ_NONE;
/* Set values to reset to if this is followed by a zero repeat. */
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
is a subroutine call by number (Oniguruma syntax). In fact, the value
-ESC_g is returned only for these cases. So we don't need to check for <
or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
that is a synonym for a named back reference). */
if (-c == ESC_g)
{
const uschar *p;
save_hwm = cd->hwm; /* Normally this is set when '(' is read */
terminator = (*(++ptr) == '<')? '>' : '\'';
/* These two statements stop the compiler from warning about possibly
unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
fact, because we actually check for a number below, the paths that
would actually be in error are never taken. */
skipbytes = 0;
reset_bracount = FALSE;
/* Test for a name */
if (ptr[1] != '+' && ptr[1] != '-')
{
BOOL isnumber = TRUE;
for (p = ptr + 1; *p != 0 && *p != terminator; p++)
{
if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
if ((cd->ctypes[*p] & ctype_word) == 0) break;
}
if (*p != terminator)
{
*errorcodeptr = ERR57;
break;
}
if (isnumber)
{
ptr++;
goto HANDLE_NUMERICAL_RECURSION;
}
is_recurse = TRUE;
goto NAMED_REF_OR_RECURSE;
}
/* Test a signed number in angle brackets or quotes. */
p = ptr + 2;
while ((digitab[*p] & ctype_digit) != 0) p++;
if (*p != terminator)
{
*errorcodeptr = ERR57;
break;
}
ptr++;
goto HANDLE_NUMERICAL_RECURSION;
}
/* \k<name> or \k'name' is a back reference by name (Perl syntax).
We also support \k{name} (.NET syntax) */
if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
{
is_recurse = FALSE;
terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
goto NAMED_REF_OR_RECURSE;
}
/* Back references are handled specially; must disable firstbyte if
not set to cope with cases like (?=(\w+))\1: which would otherwise set
':' later. */
if (-c >= ESC_REF)
{
recno = -c - ESC_REF;
HANDLE_REFERENCE: /* Come here from named backref handling */
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
previous = code;
*code++ = OP_REF;
PUT2INC(code, 0, recno);
cd->backref_map |= (recno < 32)? (1 << recno) : 1;
if (recno > cd->top_backref) cd->top_backref = recno;
}
/* So are Unicode property matches, if supported. */
#ifdef SUPPORT_UCP
else if (-c == ESC_P || -c == ESC_p)
{
BOOL negated;
int pdata;
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
if (ptype < 0) goto FAILED;
previous = code;
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
}
#else
/* If Unicode properties are not supported, \X, \P, and \p are not
allowed. */
else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
{
*errorcodeptr = ERR45;
goto FAILED;
}
#endif
/* For the rest (including \X when Unicode properties are supported), we
can obtain the OP value by negating the escape value. */
else
{
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
*code++ = -c;
}
continue;
}
/* We have a data character whose value is in c. In UTF-8 mode it may have
a value > 127. We set its representation in the length/buffer, and then
handle it as a data character. */
#ifdef SUPPORT_UTF8
if (utf8 && c > 127)
mclength = _pcre_ord2utf8(c, mcbuffer);
else
#endif
{
mcbuffer[0] = c;
mclength = 1;
}
goto ONE_CHAR;
/* ===================================================================*/
/* Handle a literal character. It is guaranteed not to be whitespace or #
when the extended flag is set. If we are in UTF-8 mode, it may be a
multi-byte literal character. */
default:
NORMAL_CHAR:
mclength = 1;
mcbuffer[0] = c;
#ifdef SUPPORT_UTF8
if (utf8 && c >= 0xc0)
{
while ((ptr[1] & 0xc0) == 0x80)
mcbuffer[mclength++] = *(++ptr);
}
#endif
/* At this point we have the character's bytes in mcbuffer, and the length
in mclength. When not in UTF-8 mode, the length is always 1. */
ONE_CHAR:
previous = code;
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
/* Remember if \r or \n were seen */
if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
cd->external_flags |= PCRE_HASCRORLF;
/* Set the first and required bytes appropriately. If no previous first
byte, set it from this character, but revert to none on a zero repeat.
Otherwise, leave the firstbyte value alone, and don't change it on a zero
repeat. */
if (firstbyte == REQ_UNSET)
{
zerofirstbyte = REQ_NONE;
zeroreqbyte = reqbyte;
/* If the character is more than one byte long, we can set firstbyte
only if it is not to be matched caselessly. */
if (mclength == 1 || req_caseopt == 0)
{
firstbyte = mcbuffer[0] | req_caseopt;
if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
}
else firstbyte = reqbyte = REQ_NONE;
}
/* firstbyte was previously set; we can set reqbyte only if the length is
1 or the matching is caseful. */
else
{
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
if (mclength == 1 || req_caseopt == 0)
reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
}
break; /* End of literal character handling */
}
} /* end of big loop */
/* Control never reaches here by falling through, only by a goto for all the
error states. Pass back the position in the pattern so that it can be displayed
to the user for diagnosing the error. */
FAILED:
*ptrptr = ptr;
return FALSE;
}
/*************************************************
* Compile sequence of alternatives *
*************************************************/
/* On entry, ptr is pointing past the bracket character, but on return it
points to the closing bracket, or vertical bar, or end of string. The code
variable is pointing at the byte into which the BRA operator has been stored.
If the ims options are changed at the start (for a (?ims: group) or during any
branch, we need to insert an OP_OPT item at the start of every following branch
to ensure they get set correctly at run time, and also pass the new options
into every subsequent branch compile.
This function is used during the pre-compile phase when we are trying to find
out the amount of memory needed, as well as during the real compile phase. The
value of lengthptr distinguishes the two phases.
Arguments:
options option bits, including any changes for this subpattern
oldims previous settings of ims option bits
codeptr -> the address of the current code pointer
ptrptr -> the address of the current pattern pointer
errorcodeptr -> pointer to error code variable
lookbehind TRUE if this is a lookbehind assertion
reset_bracount TRUE to reset the count for each branch
skipbytes skip this many bytes at start (for brackets and OP_COND)
firstbyteptr place to put the first required character, or a negative number
reqbyteptr place to put the last required character, or a negative number
bcptr pointer to the chain of currently open branches
cd points to the data block with tables pointers etc.
lengthptr NULL during the real compile phase
points to length accumulator during pre-compile phase
Returns: TRUE on success
*/
static BOOL
compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
int *lengthptr)
{
/* Compile one bracketed group: a sequence of alternatives separated by '|'.
Each alternative is handed to compile_branch(); this function links the
branches together, writes the closing KET, and passes back the updated code
and pattern pointers plus the first/required byte values for the group. */
const uschar *ptr = *ptrptr;
uschar *code = *codeptr;
uschar *last_branch = code;      /* Start of the most recent branch (BRA or ALT) */
uschar *start_bracket = code;    /* Start of the group; used for the KET length */
uschar *reverse_count = NULL;    /* Link field of the current OP_REVERSE, if any */
int firstbyte, reqbyte;
int branchfirstbyte, branchreqbyte;
int length;
int orig_bracount;               /* Bracket count on entry, for (?| resetting */
int max_bracount;                /* Highest bracket count seen in any branch */
branch_chain bc;
/* Add this group to the chain of currently open branches. */
bc.outer = bcptr;
bc.current = code;
firstbyte = reqbyte = REQ_UNSET;
/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra bytes that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of code at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the work space is not exceeded. */
length = 2 + 2*LINK_SIZE + skipbytes;
/* WARNING: If the above line is changed for any reason, you must also change
the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */
/* Offset is set zero to mark that this bracket is still open */
PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipbytes;
/* Loop for each alternative branch */
orig_bracount = max_bracount = cd->bracount;
for (;;)
{
/* For a (?| group, reset the capturing bracket count so that each branch
uses the same numbers. */
if (reset_bracount) cd->bracount = orig_bracount;
/* Handle a change of ims options at the start of the branch */
if ((options & PCRE_IMS) != oldims)
{
*code++ = OP_OPT;
*code++ = options & PCRE_IMS;
length += 2;
}
/* Set up dummy OP_REVERSE if lookbehind assertion. The zero written here is
overwritten with the real fixed length once the branch has been compiled. */
if (lookbehind)
{
*code++ = OP_REVERSE;
reverse_count = code;
PUTINC(code, 0, 0);
length += 1 + LINK_SIZE;
}
/* Now compile the branch; in the pre-compile phase its length gets added
into the length. On failure, pass back the pattern position for the error. */
if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
&branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
{
*ptrptr = ptr;
return FALSE;
}
/* Keep the highest bracket count in case (?| was used and some branch
has fewer than the rest. */
if (cd->bracount > max_bracount) max_bracount = cd->bracount;
/* In the real compile phase, there is some post-processing to be done. */
if (lengthptr == NULL)
{
/* If this is the first branch, the firstbyte and reqbyte values for the
branch become the values for the regex. */
if (*last_branch != OP_ALT)
{
firstbyte = branchfirstbyte;
reqbyte = branchreqbyte;
}
/* If this is not the first branch, the first char and reqbyte have to
match the values from all the previous branches, except that if the
previous value for reqbyte didn't have REQ_VARY set, it can still match,
and we set REQ_VARY for the regex. */
else
{
/* If we previously had a firstbyte, but it doesn't match the new branch,
we have to abandon the firstbyte for the regex, but if there was
previously no reqbyte, it takes on the value of the old firstbyte. */
if (firstbyte >= 0 && firstbyte != branchfirstbyte)
{
if (reqbyte < 0) reqbyte = firstbyte;
firstbyte = REQ_NONE;
}
/* If we (now or from before) have no firstbyte, a firstbyte from the
branch becomes a reqbyte if there isn't a branch reqbyte. */
if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
branchreqbyte = branchfirstbyte;
/* Now ensure that the reqbytes match */
if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
reqbyte = REQ_NONE;
else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
}
/* If lookbehind, check that this branch matches a fixed-length string, and
put the length into the OP_REVERSE item. Temporarily mark the end of the
branch with OP_END. */
if (lookbehind)
{
int fixed_length;
*code = OP_END;
fixed_length = find_fixedlength(last_branch, options);
DPRINTF(("fixed length = %d\n", fixed_length));
/* A negative result means the branch is not fixed length; -2 selects
ERR36 and any other negative value selects ERR25. */
if (fixed_length < 0)
{
*errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
*ptrptr = ptr;
return FALSE;
}
PUT(reverse_count, 0, fixed_length);
}
}
/* Reached end of expression, either ')' or end of pattern. In the real
compile phase, go back through the alternative branches and reverse the chain
of offsets, with the field in the BRA item now becoming an offset to the
first alternative. If there are no alternatives, it points to the end of the
group. The length in the terminating ket is always the length of the whole
bracketed item. If any of the ims options were changed inside the group,
compile a resetting op-code following, except at the very end of the pattern.
Return leaving the pointer at the terminating char. */
if (*ptr != '|')
{
if (lengthptr == NULL)
{
/* Walk backwards from the last branch: each link field holds the distance
back to the previous branch, and is replaced by the forward length of its
own branch. The walk stops at the BRA, whose field was set to zero. */
int branch_length = code - last_branch;
do
{
int prev_length = GET(last_branch, 1);
PUT(last_branch, 1, branch_length);
branch_length = prev_length;
last_branch -= branch_length;
}
while (branch_length > 0);
}
/* Fill in the ket */
*code = OP_KET;
PUT(code, 1, code - start_bracket);
code += 1 + LINK_SIZE;
/* Resetting option if needed */
if ((options & PCRE_IMS) != oldims && *ptr == ')')
{
*code++ = OP_OPT;
*code++ = oldims;
length += 2;
}
/* Retain the highest bracket number, in case resetting was used. */
cd->bracount = max_bracount;
/* Set values to pass back */
*codeptr = code;
*ptrptr = ptr;
*firstbyteptr = firstbyte;
*reqbyteptr = reqbyte;
/* In the pre-compile phase, add this group's length to the caller's
accumulator, giving ERR20 if the total would exceed OFLOW_MAX. */
if (lengthptr != NULL)
{
if (OFLOW_MAX - *lengthptr < length)
{
*errorcodeptr = ERR20;
return FALSE;
}
*lengthptr += length;
}
return TRUE;
}
/* Another branch follows. In the pre-compile phase, we can move the code
pointer back to where it was for the start of the first branch. (That is,
pretend that each branch is the only one.)
In the real compile phase, insert an ALT node. Its length field points back
to the previous branch while the bracket remains open. At the end the chain
is reversed. It's done like this so that the start of the bracket has a
zero offset until it is closed, making it possible to detect recursion. */
if (lengthptr != NULL)
{
code = *codeptr + 1 + LINK_SIZE + skipbytes;
length += 1 + LINK_SIZE;
}
else
{
*code = OP_ALT;
PUT(code, 1, code - last_branch);
bc.current = last_branch = code;
code += 1 + LINK_SIZE;
}
/* Step over the '|' to the start of the next alternative. */
ptr++;
}
/* Control never reaches here */
}
/*************************************************
* Check for anchored expression *
*************************************************/
/* Try to find out if this is an anchored regular expression. Consider each
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
it's anchored. However, if this is a multiline pattern, then only OP_SOD
counts, since OP_CIRC can match in the middle.
We can also consider a regex to be anchored if OP_SOM starts all its branches.
This is the code for \G, which means "match at start of match position, taking
into account the match offset".
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
because that will try the rest of the pattern at all possible matching points,
so there is no point trying again.... er ....
.... except when the .* appears inside capturing parentheses, and there is a
subsequent back reference to those parentheses. We haven't enough information
to catch that case precisely.
At first, the best we could do was to detect when .* was in capturing brackets
and the highest back reference was greater than or equal to that level.
However, by keeping a bitmap of the first 31 back references, we can catch some
of the more common cases more precisely.
Arguments:
code points to start of expression (the bracket)
options points to the options setting
bracket_map a bitmap of which brackets we are inside while testing; this
handles up to substring 31; after that we just have to take
the less precise approach
backref_map the back reference bitmap
Returns: TRUE or FALSE
*/
static BOOL
is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
  unsigned int backref_map)
{
/* Examine every alternative of the group at "code"; return FALSE as soon as
one alternative is found that does not start with an anchoring item. */
do {
   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
     options, PCRE_MULTILINE, FALSE);
   register int op = *scode;

   /* Non-capturing brackets */

   if (op == OP_BRA)
     {
     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
     }

   /* Capturing brackets. Record the group number in the bitmap before
   recursing; numbers of 32 or more all share bit 0. The shift is done in
   unsigned arithmetic because (1 << 31) on a signed int is undefined
   behaviour, and the maps themselves are unsigned. */

   else if (op == OP_CBRA)
     {
     int n = GET2(scode, 1+LINK_SIZE);
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
     }

   /* Other brackets */

   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
     {
     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
     }

   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
   it isn't in brackets that are or may be referenced. */

   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
             op == OP_TYPEPOSSTAR))
     {
     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
       return FALSE;
     }

   /* Check for explicit anchoring: OP_SOD and OP_SOM always count; OP_CIRC
   counts only when PCRE_MULTILINE is not set. */

   else if (op != OP_SOD && op != OP_SOM &&
           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
     return FALSE;

   /* Move on to the next alternative */

   code += GET(code, 1);
   }
while (*code == OP_ALT);   /* Loop for each alternative */
return TRUE;
}
/*************************************************
* Check for starting with ^ or .* *
*************************************************/
/* This is called to find out if every branch starts with ^ or .* so that
"first char" processing can be done to speed things up in multiline
matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption.
Arguments:
code points to start of expression (the bracket)
bracket_map a bitmap of which brackets we are inside while testing; this
handles up to substring 31; after that we just have to take
the less precise approach
backref_map the back reference bitmap
Returns: TRUE or FALSE
*/
static BOOL
is_startline(const uschar *code, unsigned int bracket_map,
  unsigned int backref_map)
{
/* Examine every alternative of the group at "code"; return FALSE as soon as
one alternative is found that does not start with ^ or a usable .* item. */
do {
   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
     NULL, 0, FALSE);
   register int op = *scode;

   /* Non-capturing brackets */

   if (op == OP_BRA)
     {
     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
     }

   /* Capturing brackets. Record the group number in the bitmap before
   recursing; numbers of 32 or more all share bit 0. The shift is done in
   unsigned arithmetic because (1 << 31) on a signed int is undefined
   behaviour, and the maps themselves are unsigned. */

   else if (op == OP_CBRA)
     {
     int n = GET2(scode, 1+LINK_SIZE);
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
     if (!is_startline(scode, new_map, backref_map)) return FALSE;
     }

   /* Other brackets */

   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
     {
     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
     }

   /* .* means "start at start or after \n" if it isn't in brackets that
   may be referenced. */

   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
     {
     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
     }

   /* Check for explicit circumflex */

   else if (op != OP_CIRC) return FALSE;

   /* Move on to the next alternative */

   code += GET(code, 1);
   }
while (*code == OP_ALT);  /* Loop for each alternative */
return TRUE;
}
/*************************************************
* Check for asserted fixed first char *
*************************************************/
/* During compilation, the "first char" settings from forward assertions are
discarded, because they can cause conflicts with actual literals that follow.
However, if we end up without a first char setting for an unanchored pattern,
it is worth scanning the regex to see if there is an initial asserted first
char. If all branches start with the same asserted char, or with a bracket all
of whose alternatives start with the same asserted char (recurse ad lib), then
we return that char, otherwise -1.
Arguments:
code points to start of expression (the bracket)
options pointer to the options (used to check casing changes)
inassert TRUE if in an assertion
Returns: -1 or the fixed first char
*/
static int
find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
{
int found = -1;   /* Asserted first char agreed on so far; -1 means none yet */

/* Visit each alternative in turn. Every one of them must yield the same
character, otherwise there is no fixed asserted first char. */

for (;;)
  {
  const uschar *item =
    first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
  int op = *item;
  int inner;

  switch (op)
    {
    /* Nested groups: the answer comes from recursing into the group. Only a
    forward assertion switches "inassert" on for the nested scan. */

    case OP_BRA:
    case OP_CBRA:
    case OP_ASSERT:
    case OP_ONCE:
    case OP_COND:
      inner = find_firstassertedchar(item, options, op == OP_ASSERT);
      if (inner < 0) return -1;
      if (found < 0) found = inner;
      else if (found != inner) return -1;
      break;

    /* For OP_EXACT, step forward by two bytes so that item[1] is read from
    the same relative position as for the single-character opcodes. */

    case OP_EXACT:
      item += 2;
      /* Fall through */

    case OP_CHAR:
    case OP_CHARNC:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
      if (!inassert) return -1;
      if (found < 0)
        {
        found = item[1];
        if ((*options & PCRE_CASELESS) != 0) found |= REQ_CASELESS;
        }
      else if (found != item[1])
        {
        return -1;
        }
      break;

    /* Anything else cannot supply an asserted first char. */

    default:
      return -1;
    }

  /* Advance to the next alternative, if there is one. */

  code += GET(code, 1);
  if (*code != OP_ALT) break;
  }

return found;
}
/*************************************************
* Compile a Regular Expression *
*************************************************/
/* This function takes a string and returns a pointer to a block of store
holding a compiled version of the expression. The original API for this
function had no error code return variable; it is retained for backwards
compatibility. The new function is given a new name.
Arguments:
pattern the regular expression
options various option bits
errorcodeptr pointer to error code variable (pcre_compile2() only)
can be NULL if you don't want a code value
errorptr pointer to pointer to error text
erroroffset ptr offset in pattern where error was detected
tables pointer to character tables or NULL
Returns: pointer to compiled data block, or NULL on error,
with errorptr and erroroffset set
*/
PCRE_EXP_DEFN pcre *
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
/* The original API has no error-code variable, so hand pcre_compile2() a NULL
pointer for it and pass every other argument straight through. */
int *no_errorcode = NULL;
pcre *compiled;

compiled = pcre_compile2(pattern, options, no_errorcode, errorptr,
  erroroffset, tables);
return compiled;
}
/* Compile a pattern, reporting failures through errorptr/erroroffset and,
when errorcodeptr is not NULL, through a numeric code as well.

The work is done in two calls to compile_regex(). The first compiles into the
on-stack cworkspace buffer and is given &length, so that the amount of memory
needed is accumulated. A block of that size (plus the real_pcre header and the
name table) is then obtained from pcre_malloc, and the second call compiles
into it with a NULL length pointer.

Returns the new block cast to (pcre *), or NULL on any error. On the error
paths that reach PCRE_EARLY_ERROR_RETURN, *erroroffset is set from the current
pattern pointer; *errorptr is set from find_error_text(errorcode). */
PCRE_EXP_DEFN pcre *
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
const char **errorptr, int *erroroffset, const unsigned char *tables)
{
real_pcre *re;
int length = 1; /* For final END opcode */
int firstbyte, reqbyte, newline;
int errorcode = 0;
int skipatstart = 0;
#ifdef SUPPORT_UTF8
BOOL utf8;
#endif
size_t size;
uschar *code;
const uschar *codestart;
const uschar *ptr;
compile_data compile_block;
compile_data *cd = &compile_block;
/* This space is used for "compiling" into during the first phase, when we are
computing the amount of memory that is needed. Compiled items are thrown away
as soon as possible, so that a fairly large buffer should be sufficient for
this purpose. The same space is used in the second phase for remembering where
to fill in forward references to subpatterns. */
uschar cworkspace[COMPILE_WORK_SIZE];
/* Set this early so that early errors get offset 0. */
ptr = (const uschar *)pattern;
/* We can't pass back an error message if errorptr is NULL; I guess the best we
can do is just return NULL, but we can set a code value if there is a code
pointer. */
if (errorptr == NULL)
{
/* NOTE(review): 99 is a bare literal rather than an ERRnn name; presumably it
is reserved for this "no errorptr" case - confirm against the error table. */
if (errorcodeptr != NULL) *errorcodeptr = 99;
return NULL;
}
*errorptr = NULL;
if (errorcodeptr != NULL) *errorcodeptr = ERR0;
/* However, we can give a message for this error */
if (erroroffset == NULL)
{
errorcode = ERR16;
/* Jump to the second label: the first one writes through erroroffset. */
goto PCRE_EARLY_ERROR_RETURN2;
}
*erroroffset = 0;
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
#ifdef SUPPORT_UTF8
utf8 = (options & PCRE_UTF8) != 0;
/* The result of _pcre_valid_utf8() is stored directly in *erroroffset; a value
>= 0 is treated as a failure, and the jump to RETURN2 keeps that stored value
instead of overwriting it with the pattern pointer offset. */
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
{
errorcode = ERR44;
goto PCRE_EARLY_ERROR_RETURN2;
}
#else
if ((options & PCRE_UTF8) != 0)
{
errorcode = ERR32;
goto PCRE_EARLY_ERROR_RETURN;
}
#endif
/* Reject any option bit that is not in PUBLIC_OPTIONS. */
if ((options & ~PUBLIC_OPTIONS) != 0)
{
errorcode = ERR17;
goto PCRE_EARLY_ERROR_RETURN;
}
/* Set up pointers to the individual character tables */
if (tables == NULL) tables = _pcre_default_tables;
cd->lcc = tables + lcc_offset;
cd->fcc = tables + fcc_offset;
cd->cbits = tables + cbits_offset;
cd->ctypes = tables + ctypes_offset;
/* Check for global one-time settings at the start of the pattern, and remember
the offset for later. Each recognized "(*NAME)" item advances skipatstart past
itself and replaces the corresponding bits in options; the loop ends at the
first "(*" item that is not one of the names tested here. */
while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
{
int newnl = 0;
int newbsr = 0;
if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
{ skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
{ skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
{ skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
{ skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
{ skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
{ skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
if (newnl != 0)
options = (options & ~PCRE_NEWLINE_BITS) | newnl;
else if (newbsr != 0)
options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
else break;
}
/* Check validity of \R options. Having both BSR bits set at once is an
error. */
switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
{
case 0:
case PCRE_BSR_ANYCRLF:
case PCRE_BSR_UNICODE:
break;
default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
}
/* Handle different types of newline. The three bits give seven cases. The
current code allows for fixed one- or two-byte sequences, plus "any" and
"anycrlf". In the local variable, -1 stands for "any", -2 for "anycrlf", and a
two-byte sequence is packed as (first << 8) | second. */
switch (options & PCRE_NEWLINE_BITS)
{
case 0: newline = NEWLINE; break; /* Build-time default */
case PCRE_NEWLINE_CR: newline = '\r'; break;
case PCRE_NEWLINE_LF: newline = '\n'; break;
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
case PCRE_NEWLINE_ANY: newline = -1; break;
case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
}
/* Translate the local newline value into the nltype/nllen/nl fields of the
compile data block. */
if (newline == -2)
{
cd->nltype = NLTYPE_ANYCRLF;
}
else if (newline < 0)
{
cd->nltype = NLTYPE_ANY;
}
else
{
cd->nltype = NLTYPE_FIXED;
if (newline > 255)
{
cd->nllen = 2;
cd->nl[0] = (newline >> 8) & 255;
cd->nl[1] = newline & 255;
}
else
{
cd->nllen = 1;
cd->nl[0] = newline;
}
}
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
references to help in deciding whether (.*) can be treated as anchored or not.
*/
cd->top_backref = 0;
cd->backref_map = 0;
/* Reflect pattern for debugging output */
DPRINTF(("------------------------------------------------------------------\n"));
DPRINTF(("%s\n", pattern));
/* Pretend to compile the pattern while actually just accumulating the length
of memory required. This behaviour is triggered by passing a non-NULL final
argument to compile_regex(). We pass a block of workspace (cworkspace) for it
to compile parts of the pattern into; the compiled code is discarded when it is
no longer needed, so hopefully this workspace will never overflow, though there
is a test for its doing so. */
cd->bracount = cd->final_bracount = 0;
cd->names_found = 0;
cd->name_entry_size = 0;
cd->name_table = NULL;
cd->start_workspace = cworkspace;
cd->start_code = cworkspace;
cd->hwm = cworkspace;
cd->start_pattern = (const uschar *)pattern;
cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
cd->req_varyopt = 0;
cd->external_options = options;
cd->external_flags = 0;
/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
don't need to look at the result of the function here. The initial options have
been put into the cd block so that they can be changed if an option setting is
found within the regex right at the beginning. Bringing initial option settings
outside can help speed up starting point checks. */
ptr += skipatstart;
code = cworkspace;
*code = OP_BRA;
(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
&code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
&length);
if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
cd->hwm - cworkspace));
if (length > MAX_PATTERN_SIZE)
{
errorcode = ERR20;
goto PCRE_EARLY_ERROR_RETURN;
}
/* Compute the size of data block needed and get it, either from malloc or
externally provided function. Integer overflow should no longer be possible
because nowadays we limit the maximum value of cd->names_found and
cd->name_entry_size. */
size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
re = (real_pcre *)(pcre_malloc)(size);
if (re == NULL)
{
errorcode = ERR21;
goto PCRE_EARLY_ERROR_RETURN;
}
/* Put in the magic number, and save the sizes, initial options, internal
flags, and character table pointer. NULL is used for the default character
tables. The nullpad field is at the end; it's there to help in the case when a
regex compiled on a system with 4-byte pointers is run on another with 8-byte
pointers. */
re->magic_number = MAGIC_NUMBER;
re->size = size;
re->options = cd->external_options;
re->flags = cd->external_flags;
re->dummy1 = 0;
re->first_byte = 0;
re->req_byte = 0;
re->name_table_offset = sizeof(real_pcre);
re->name_entry_size = cd->name_entry_size;
re->name_count = cd->names_found;
re->ref_count = 0;
re->tables = (tables == _pcre_default_tables)? NULL : tables;
re->nullpad = NULL;
/* The starting points of the name/number translation table and of the code are
passed around in the compile data block. The start/end pattern and initial
options are already set from the pre-compile phase, as is the name_entry_size
field. Reset the bracket count and the names_found field. Also reset the hwm
field; this time it's used for remembering forward references to subpatterns.
*/
cd->final_bracount = cd->bracount; /* Save for checking forward references */
cd->bracount = 0;
cd->names_found = 0;
cd->name_table = (uschar *)re + re->name_table_offset;
/* The compiled code is placed immediately after the name table. */
codestart = cd->name_table + re->name_entry_size * re->name_count;
cd->start_code = codestart;
cd->hwm = cworkspace;
cd->req_varyopt = 0;
cd->had_accept = FALSE;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, errorcode will be set non-zero, so we don't need to look at the result
of the function here. */
ptr = (const uschar *)pattern + skipatstart;
code = (uschar *)codestart;
*code = OP_BRA;
(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
&errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
re->top_bracket = cd->bracount;
re->top_backref = cd->top_backref;
re->flags = cd->external_flags;
if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
/* If not reached end of pattern on success, there's an excess bracket. */
if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
/* Fill in the terminating state and check for disastrous overflow, but
if debugging, leave the test till after things are printed out. */
*code++ = OP_END;
#ifndef DEBUG
if (code - codestart > length) errorcode = ERR23;
#endif
/* Fill in any forward references that are required. Each entry popped from
the workspace is an offset into the compiled code; the group number stored at
that offset is looked up with find_bracket() and replaced by the offset of the
group found. */
while (errorcode == 0 && cd->hwm > cworkspace)
{
int offset, recno;
const uschar *groupptr;
cd->hwm -= LINK_SIZE;
offset = GET(cd->hwm, 0);
recno = GET(codestart, offset);
groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
if (groupptr == NULL) errorcode = ERR53;
else PUT(((uschar *)codestart), offset, groupptr - codestart);
}
/* Give an error if there's back reference to a non-existent capturing
subpattern. */
if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
/* Failed to compile, or error while post-processing. The two labels sit
inside this block, after the pcre_free() call, so that the earlier gotos
(taken before re has been obtained) share the reporting code without freeing
anything. */
if (errorcode != 0)
{
(pcre_free)(re);
PCRE_EARLY_ERROR_RETURN:
*erroroffset = ptr - (const uschar *)pattern;
PCRE_EARLY_ERROR_RETURN2:
*errorptr = find_error_text(errorcode);
if (errorcodeptr != NULL) *errorcodeptr = errorcode;
return NULL;
}
/* If the anchored option was not passed, set the flag if we can determine that
the pattern is anchored by virtue of ^ characters or \A or anything else (such
as starting with .* when DOTALL is set).
Otherwise, if we know what the first byte has to be, save it, because that
speeds up unanchored matches no end. If not, see if we can set the
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
start with ^. and also when all branches start with .* for non-DOTALL matches.
*/
if ((re->options & PCRE_ANCHORED) == 0)
{
int temp_options = re->options; /* May get changed during these scans */
if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
re->options |= PCRE_ANCHORED;
else
{
if (firstbyte < 0)
firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
{
int ch = firstbyte & 255;
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
cd->fcc[ch] == ch)? ch : firstbyte;
re->flags |= PCRE_FIRSTSET;
}
else if (is_startline(codestart, 0, cd->backref_map))
re->flags |= PCRE_STARTLINE;
}
}
/* For an anchored pattern, we use the "required byte" only if it follows a
variable length item in the regex. Remove the caseless flag for non-caseable
bytes. */
if (reqbyte >= 0 &&
((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
{
int ch = reqbyte & 255;
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
re->flags |= PCRE_REQCHSET;
}
/* Print out the compiled data if debugging is enabled. This is never the
case when building a production library. */
#ifdef DEBUG
printf("Length = %d top_bracket = %d top_backref = %d\n",
length, re->top_bracket, re->top_backref);
printf("Options=%08x\n", re->options);
if ((re->flags & PCRE_FIRSTSET) != 0)
{
int ch = re->first_byte & 255;
const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
"" : " (caseless)";
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
else printf("First char = \\x%02x%s\n", ch, caseless);
}
if ((re->flags & PCRE_REQCHSET) != 0)
{
int ch = re->req_byte & 255;
const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
"" : " (caseless)";
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
else printf("Req char = \\x%02x%s\n", ch, caseless);
}
pcre_printint(re, stdout, TRUE);
/* This check is done here in the debugging case so that the code that
was compiled can be seen. */
if (code - codestart > length)
{
(pcre_free)(re);
*errorptr = find_error_text(ERR23);
*erroroffset = ptr - (uschar *)pattern;
if (errorcodeptr != NULL) *errorcodeptr = ERR23;
return NULL;
}
#endif /* DEBUG */
return (pcre *)re;
}
| pcrecomp.c | 2406 |
pcreconf.c |
Type | Function | Source | Line |
PCRE_EXP_DEFN INT | pcre_config(int what, void *where)
PCRE_EXP_DEFN int
pcre_config(int what, void *where)
{
int answer;   /* Value for the settings that are reported through an int */

switch (what)
  {
  /* The two match limits are stored through an unsigned int pointer, so they
  are written here and returned straight away. */
  case PCRE_CONFIG_MATCH_LIMIT:
  *((unsigned int *)where) = MATCH_LIMIT;
  return 0;

  case PCRE_CONFIG_MATCH_LIMIT_RECURSION:
  *((unsigned int *)where) = MATCH_LIMIT_RECURSION;
  return 0;

  /* Yes/no build-time features: 1 or 0 according to the relevant macro. */
  case PCRE_CONFIG_UTF8:
#ifdef SUPPORT_UTF8
  answer = 1;
#else
  answer = 0;
#endif
  break;

  case PCRE_CONFIG_UNICODE_PROPERTIES:
#ifdef SUPPORT_UCP
  answer = 1;
#else
  answer = 0;
#endif
  break;

  case PCRE_CONFIG_BSR:
#ifdef BSR_ANYCRLF
  answer = 1;
#else
  answer = 0;
#endif
  break;

  /* Note the sense: 0 is reported when NO_RECURSE is defined. */
  case PCRE_CONFIG_STACKRECURSE:
#ifdef NO_RECURSE
  answer = 0;
#else
  answer = 1;
#endif
  break;

  /* Numeric build-time settings, passed through as they are. */
  case PCRE_CONFIG_NEWLINE:
  answer = NEWLINE;
  break;

  case PCRE_CONFIG_LINK_SIZE:
  answer = LINK_SIZE;
  break;

  case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
  answer = POSIX_MALLOC_THRESHOLD;
  break;

  /* Unknown request: nothing is written through "where". */
  default:
  return PCRE_ERROR_BADOPTION;
  }

*((int *)where) = answer;
return 0;
}
| pcreconf.c | 65 |
pcredfa.c |
Type | Function | Source | Line |
STATIC VOID | pchars(unsigned char *p, int length, FILE *f)
/* Write "length" bytes starting at p to the stream f. Bytes for which
isprint() is true are written as themselves; every other byte is written as
\x followed by two hex digits. Nothing is written when length is not
positive. */
static void
pchars(unsigned char *p, int length, FILE *f)
{
const unsigned char *stop;

if (length <= 0) return;

for (stop = p + length; p < stop; p++)
  {
  int ch = *p;
  if (isprint(ch))
    fprintf(f, "%c", ch);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
#endif
/*************************************************
* Execute a Regular Expression - DFA engine *
*************************************************/
/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.
Arguments:
md the match_data block with fixed information
this_start_code the opening bracket of this subexpression's code
current_subject where we currently are in the subject string
start_offset start offset in the subject string
offsets vector to contain the matching string offsets
offsetcount size of same
workspace vector of workspace
wscount size of same
ims the current ims flags
rlevel function call recursion level
recursing regex recursive call level
Returns: > 0 => number of match offset pairs placed in offsets
= 0 => offsets overflowed; longest matches are present
-1 => failed to match
< -1 => some kind of unexpected problem
The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
/* All four macros below expand to an if/else statement with no trailing
semicolon (the user supplies it) and refer to variables of the function in
which they are expanded: a counter and a "next free entry" pointer for the list
concerned, plus wscount, ims and (for the debug output) rlevel. When the
counter has already reached wscount, the expansion executes a return of
PCRE_ERROR_DFA_WSSIZE from that function. */

/* ADD_ACTIVE: store offset x and count y, together with the current ims, in
the next free entry of the active list, and advance to the following entry. */
#define ADD_ACTIVE(x,y) \
if (active_count++ < wscount) \
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
next_active_state->ims = ims; \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
else return PCRE_ERROR_DFA_WSSIZE
/* ADD_ACTIVE_DATA: as ADD_ACTIVE, but the data field is also set, to z. */
#define ADD_ACTIVE_DATA(x,y,z) \
if (active_count++ < wscount) \
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
next_active_state->ims = ims; \
next_active_state->data = (z); \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
} \
else return PCRE_ERROR_DFA_WSSIZE
/* ADD_NEW: store offset x and count y, together with the current ims, in the
next free entry of the new list, and advance to the following entry. */
#define ADD_NEW(x,y) \
if (new_count++ < wscount) \
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
next_new_state->ims = ims; \
next_new_state++; \
DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
else return PCRE_ERROR_DFA_WSSIZE
/* ADD_NEW_DATA: as ADD_NEW, but the data field is also set, to z. */
#define ADD_NEW_DATA(x,y,z) \
if (new_count++ < wscount) \
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
next_new_state->ims = ims; \
next_new_state->data = (z); \
next_new_state++; \
DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
} \
else return PCRE_ERROR_DFA_WSSIZE
| pcredfa.c | 188 |
STATIC INT | internal_dfa_exec( dfa_match_data *md, const uschar *this_start_code, const uschar *current_subject, int start_offset, int *offsets, int offsetcount, int *workspace, int wscount, int ims, int rlevel, int recursing)
static int
internal_dfa_exec(
dfa_match_data *md,
const uschar *this_start_code,
const uschar *current_subject,
int start_offset,
int *offsets,
int offsetcount,
int *workspace,
int wscount,
int ims,
int rlevel,
int recursing)
{
stateblock *active_states, *new_states, *temp_states;
stateblock *next_active_state, *next_new_state;
const uschar *ctypes, *lcc, *fcc;
const uschar *ptr;
const uschar *end_code, *first_op;
int active_count, new_count, match_count;
/* Some fields in the md block are frequently referenced, so we load them into
independent variables in the hope that this will perform better. */
const uschar *start_subject = md->start_subject;
const uschar *end_subject = md->end_subject;
const uschar *start_code = md->start_code;
#ifdef SUPPORT_UTF8
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
#else
BOOL utf8 = FALSE;
#endif
rlevel++;
offsetcount &= (-2);
wscount -= 2;
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
(2 * INTS_PER_STATEBLOCK);
DPRINTF(("\n%.*s---------------------\n"
"%.*sCall to internal_dfa_exec f=%d r=%d\n",
rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
ctypes = md->tables + ctypes_offset;
lcc = md->tables + lcc_offset;
fcc = md->tables + fcc_offset;
match_count = PCRE_ERROR_NOMATCH; /* A negative number */
active_states = (stateblock *)(workspace + 2);
next_new_state = new_states = active_states + wscount;
new_count = 0;
first_op = this_start_code + 1 + LINK_SIZE +
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes is possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.
If the first opcode in the first alternative is OP_REVERSE, we are dealing with
a backward assertion. In that case, we have to find out the maximum amount to
move back, and set up each alternative appropriately. */
if (*first_op == OP_REVERSE)
{
int max_back = 0;
int gone_back;
end_code = this_start_code;
do
{
int back = GET(end_code, 2+LINK_SIZE);
if (back > max_back) max_back = back;
end_code += GET(end_code, 1);
}
while (*end_code == OP_ALT);
/* If we can't go back the amount required for the longest lookbehind
pattern, go back as far as we can; some alternatives may still be viable. */
#ifdef SUPPORT_UTF8
/* In character mode we have to step back character by character */
if (utf8)
{
for (gone_back = 0; gone_back < max_back; gone_back++)
{
if (current_subject <= start_subject) break;
current_subject--;
while (current_subject > start_subject &&
(*current_subject & 0xc0) == 0x80)
current_subject--;
}
}
else
#endif
/* In byte-mode we can do this quickly. */
{
gone_back = (current_subject - max_back < start_subject)?
current_subject - start_subject : max_back;
current_subject -= gone_back;
}
/* Now we can process the individual branches. */
end_code = this_start_code;
do
{
int back = GET(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
ADD_NEW_DATA(-bstate, 0, gone_back - back);
}
end_code += GET(end_code, 1);
}
while (*end_code == OP_ALT);
}
/* This is the code for a "normal" subpattern (not a backward assertion). The
start of a whole pattern is always one of these. If we are at the top level,
we may be asked to restart matching from the same point that we reached for a
previous partial match. We still have to scan through the top-level branches to
find the end state. */
else
{
end_code = this_start_code;
/* Restarting */
if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
{
do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
new_count = workspace[1];
if (!workspace[0])
memcpy(new_states, active_states, new_count * sizeof(stateblock));
}
/* Not restarting */
else
{
int length = 1 + LINK_SIZE +
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
do
{
ADD_NEW(end_code - start_code + length, 0);
end_code += GET(end_code, 1);
length = 1 + LINK_SIZE;
}
while (*end_code == OP_ALT);
}
}
workspace[0] = 0; /* Bit indicating which vector is current */
DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
/* Loop for scanning the subject */
ptr = current_subject;
for (;;)
{
int i, j;
int clen, dlen;
unsigned int c, d;
/* Make the new state list into the active state list and empty the
new state list. */
temp_states = active_states;
active_states = new_states;
new_states = temp_states;
active_count = new_count;
new_count = 0;
workspace[0] ^= 1; /* Remember for the restarting feature */
workspace[1] = active_count;
#ifdef DEBUG
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
pchars((uschar *)ptr, strlen((char *)ptr), stdout);
printf("\"\n");
printf("%.*sActive states: ", rlevel*2-2, SP);
for (i = 0; i < active_count; i++)
printf("%d/%d ", active_states[i].offset, active_states[i].count);
printf("\n");
#endif
/* Set the pointers for adding new states */
next_active_state = active_states + active_count;
next_new_state = new_states;
/* Load the current character from the subject outside the loop, as many
different states may want to look at it, and we assume that at least one
will. */
if (ptr < end_subject)
{
clen = 1; /* Number of bytes in the character */
#ifdef SUPPORT_UTF8
if (utf8) { GETCHARLEN(c, ptr, clen); } else
#endif /* SUPPORT_UTF8 */
c = *ptr;
}
else
{
clen = 0; /* This indicates the end of the subject */
c = NOTACHAR; /* This value should never actually be used */
}
/* Scan up the active states and act on each one. The result of an action
may be to add more states to the currently active list (e.g. on hitting a
parenthesis) or it may be to put states on the new list, for considering
when we move the character pointer on. */
for (i = 0; i < active_count; i++)
{
stateblock *current_state = active_states + i;
const uschar *code;
int state_offset = current_state->offset;
int count, codevalue;
#ifdef SUPPORT_UCP
int chartype, script;
#endif
#ifdef DEBUG
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
if (clen == 0) printf("EOL\n");
else if (c > 32 && c < 127) printf("'%c'\n", c);
else printf("0x%02x\n", c);
#endif
/* This variable is referred to implicity in the ADD_xxx macros. */
ims = current_state->ims;
/* A negative offset is a special case meaning "hold off going to this
(negated) state until the number of characters in the data field have
been skipped". */
if (state_offset < 0)
{
if (current_state->data > 0)
{
DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
ADD_NEW_DATA(state_offset, current_state->count,
current_state->data - 1);
continue;
}
else
{
current_state->offset = state_offset = -state_offset;
}
}
/* Check for a duplicate state with the same count, and skip if found. */
for (j = 0; j < i; j++)
{
if (active_states[j].offset == state_offset &&
active_states[j].count == current_state->count)
{
DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
goto NEXT_ACTIVE_STATE;
}
}
/* The state offset is the offset to the opcode */
code = start_code + state_offset;
codevalue = *code;
/* If this opcode is followed by an inline character, load it. It is
tempting to test for the presence of a subject character here, but that
is wrong, because sometimes zero repetitions of the subject are
permitted.
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
argument that is not a data character - but is always one byte long. We
have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
this case. To keep the other cases fast, convert these ones to new opcodes.
*/
if (coptable[codevalue] > 0)
{
dlen = 1;
#ifdef SUPPORT_UTF8
if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
#endif /* SUPPORT_UTF8 */
d = code[coptable[codevalue]];
if (codevalue >= OP_TYPESTAR)
{
switch(d)
{
case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
case OP_NOTPROP:
case OP_PROP: codevalue += OP_PROP_EXTRA; break;
case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
case OP_NOT_HSPACE:
case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
case OP_NOT_VSPACE:
case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
default: break;
}
}
}
else
{
dlen = 0; /* Not strictly necessary, but compilers moan */
d = NOTACHAR; /* if these variables are not set. */
}
/* Now process the individual opcodes */
switch (codevalue)
{
/* ========================================================================== */
/* Reached a closing bracket. If not at the end of the pattern, carry
on with the next opcode. Otherwise, unless we have an empty string and
PCRE_NOTEMPTY is set, save the match data, shifting up all previous
matches so we always have the longest first. */
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
if (code != end_code)
{
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
if (codevalue != OP_KET)
{
ADD_ACTIVE(state_offset - GET(code, 1), 0);
}
}
else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
{
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
match_count = 0;
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if (offsetcount >= 2)
{
offsets[0] = current_subject - start_subject;
offsets[1] = ptr - start_subject;
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
offsets[1] - offsets[0], current_subject));
}
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
{
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
match_count, rlevel*2-2, SP));
return match_count;
}
}
break;
/* ========================================================================== */
/* These opcodes add to the current list of states without looking
at the current character. */
/*-----------------------------------------------------------------*/
case OP_ALT:
do { code += GET(code, 1); } while (*code == OP_ALT);
ADD_ACTIVE(code - start_code, 0);
break;
/*-----------------------------------------------------------------*/
case OP_BRA:
case OP_SBRA:
do
{
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
code += GET(code, 1);
}
while (*code == OP_ALT);
break;
/*-----------------------------------------------------------------*/
case OP_CBRA:
case OP_SCBRA:
ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
code += GET(code, 1);
while (*code == OP_ALT)
{
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
code += GET(code, 1);
}
break;
/*-----------------------------------------------------------------*/
case OP_BRAZERO:
case OP_BRAMINZERO:
ADD_ACTIVE(state_offset + 1, 0);
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
break;
/*-----------------------------------------------------------------*/
case OP_SKIPZERO:
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
break;
/*-----------------------------------------------------------------*/
case OP_CIRC:
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
((ims & PCRE_MULTILINE) != 0 &&
ptr != end_subject &&
WAS_NEWLINE(ptr)))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_EOD:
if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_OPT:
ims = code[1];
ADD_ACTIVE(state_offset + 2, 0);
break;
/*-----------------------------------------------------------------*/
case OP_SOD:
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_SOM:
if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
break;
/* ========================================================================== */
/* These opcodes inspect the next subject character, and sometimes
the previous one as well, but do not have an argument. The variable
clen contains the length of the current character and is zero if we are
at the end of the subject. */
/*-----------------------------------------------------------------*/
case OP_ANY:
if (clen > 0 && !IS_NEWLINE(ptr))
{ ADD_NEW(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_ALLANY:
if (clen > 0)
{ ADD_NEW(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_EODN:
if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_DOLL:
if ((md->moptions & PCRE_NOTEOL) == 0)
{
if (clen == 0 ||
(IS_NEWLINE(ptr) &&
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
))
{ ADD_ACTIVE(state_offset + 1, 0); }
}
else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_DIGIT:
case OP_WHITESPACE:
case OP_WORDCHAR:
if (clen > 0 && c < 256 &&
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
{ ADD_NEW(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_NOT_DIGIT:
case OP_NOT_WHITESPACE:
case OP_NOT_WORDCHAR:
if (clen > 0 && (c >= 256 ||
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
{ ADD_NEW(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY:
{
int left_word, right_word;
if (ptr > start_subject)
{
const uschar *temp = ptr - 1;
#ifdef SUPPORT_UTF8
if (utf8) BACKCHAR(temp);
#endif
GETCHARTEST(d, temp);
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
}
else left_word = 0;
if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
else right_word = 0;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
}
break;
/*-----------------------------------------------------------------*/
/* Check the next character by Unicode property. We will get here only
if the support is in the binary; otherwise a compile-time error occurs.
*/
#ifdef SUPPORT_UCP
case OP_PROP:
case OP_NOTPROP:
if (clen > 0)
{
BOOL OK;
int category = _pcre_ucp_findprop(c, &chartype, &script);
switch(code[1])
{
case PT_ANY:
OK = TRUE;
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
OK = category == code[2];
break;
case PT_PC:
OK = chartype == code[2];
break;
case PT_SC:
OK = script == code[2];
break;
/* Should never occur, but keep compilers from grumbling. */
default:
OK = codevalue != OP_PROP;
break;
}
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
}
break;
#endif
/* ========================================================================== */
/* These opcodes likewise inspect the subject character, but have an
argument that is not a data character. It is one of these opcodes:
OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
count++;
ADD_NEW(state_offset, count);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW(state_offset + 2, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPOSSTAR:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSSTAR)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW(state_offset, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_TYPEEXACT:
count = current_state->count; /* Number already matched */
if (clen > 0)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (++count >= GET2(code, 1))
{ ADD_NEW(state_offset + 4, 0); }
else
{ ADD_NEW(state_offset, count); }
}
}
break;
/*-----------------------------------------------------------------*/
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
ADD_ACTIVE(state_offset + 4, 0);
count = current_state->count; /* Number already matched */
if (clen > 0)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
if (++count >= GET2(code, 1))
{ ADD_NEW(state_offset + 4, 0); }
else
{ ADD_NEW(state_offset, count); }
}
}
break;
/* ========================================================================== */
/* These are virtual opcodes that are used when something like
OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE,
OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its argument. It keeps the
code above fast for the other cases. The argument is in the d variable. */
#ifdef SUPPORT_UCP
case OP_PROP_EXTRA + OP_TYPEPLUS:
case OP_PROP_EXTRA + OP_TYPEMINPLUS:
case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
if (clen > 0)
{
BOOL OK;
int category = _pcre_ucp_findprop(c, &chartype, &script);
switch(code[2])
{
case PT_ANY:
OK = TRUE;
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
OK = category == code[3];
break;
case PT_PC:
OK = chartype == code[3];
break;
case PT_SC:
OK = script == code[3];
break;
/* Should never occur, but keep compilers from grumbling. */
default:
OK = codevalue != OP_PROP;
break;
}
if (OK == (d == OP_PROP))
{
if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
count++;
ADD_NEW(state_offset, count);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
{
const uschar *nptr = ptr + clen;
int ncount = 0;
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
while (nptr < end_subject)
{
int nd;
int ndlen = 1;
GETCHARLEN(nd, nptr, ndlen);
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
ncount++;
nptr += ndlen;
}
count++;
ADD_NEW_DATA(-state_offset, count, ncount);
}
break;
#endif
/*-----------------------------------------------------------------*/
case OP_ANYNL_EXTRA + OP_TYPEPLUS:
case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
int ncount = 0;
switch (c)
{
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
goto ANYNL01;
case 0x000d:
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
/* Fall through */
ANYNL01:
case 0x000a:
if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
count++;
ADD_NEW_DATA(-state_offset, count, ncount);
break;
default:
break;
}
}
break;
/*-----------------------------------------------------------------*/
case OP_VSPACE_EXTRA + OP_TYPEPLUS:
case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
BOOL OK;
switch (c)
{
case 0x000a:
case 0x000b:
case 0x000c:
case 0x000d:
case 0x0085:
case 0x2028:
case 0x2029:
OK = TRUE;
break;
default:
OK = FALSE;
break;
}
if (OK == (d == OP_VSPACE))
{
if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
count++;
ADD_NEW_DATA(-state_offset, count, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_HSPACE_EXTRA + OP_TYPEPLUS:
case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
BOOL OK;
switch (c)
{
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
OK = TRUE;
break;
default:
OK = FALSE;
break;
}
if (OK == (d == OP_HSPACE))
{
if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
count++;
ADD_NEW_DATA(-state_offset, count, 0);
}
}
break;
/*-----------------------------------------------------------------*/
#ifdef SUPPORT_UCP
case OP_PROP_EXTRA + OP_TYPEQUERY:
case OP_PROP_EXTRA + OP_TYPEMINQUERY:
case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
count = 4;
goto QS1;
case OP_PROP_EXTRA + OP_TYPESTAR:
case OP_PROP_EXTRA + OP_TYPEMINSTAR:
case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
count = 0;
QS1:
ADD_ACTIVE(state_offset + 4, 0);
if (clen > 0)
{
BOOL OK;
int category = _pcre_ucp_findprop(c, &chartype, &script);
switch(code[2])
{
case PT_ANY:
OK = TRUE;
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
OK = category == code[3];
break;
case PT_PC:
OK = chartype == code[3];
break;
case PT_SC:
OK = script == code[3];
break;
/* Should never occur, but keep compilers from grumbling. */
default:
OK = codevalue != OP_PROP;
break;
}
if (OK == (d == OP_PROP))
{
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW(state_offset + count, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
count = 2;
goto QS2;
case OP_EXTUNI_EXTRA + OP_TYPESTAR:
case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
count = 0;
QS2:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
{
const uschar *nptr = ptr + clen;
int ncount = 0;
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
while (nptr < end_subject)
{
int nd;
int ndlen = 1;
GETCHARLEN(nd, nptr, ndlen);
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
ncount++;
nptr += ndlen;
}
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
}
break;
#endif
/*-----------------------------------------------------------------*/
case OP_ANYNL_EXTRA + OP_TYPEQUERY:
case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
count = 2;
goto QS3;
case OP_ANYNL_EXTRA + OP_TYPESTAR:
case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
count = 0;
QS3:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
int ncount = 0;
switch (c)
{
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
goto ANYNL02;
case 0x000d:
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
/* Fall through */
ANYNL02:
case 0x000a:
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
break;
default:
break;
}
}
break;
/*-----------------------------------------------------------------*/
case OP_VSPACE_EXTRA + OP_TYPEQUERY:
case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
count = 2;
goto QS4;
case OP_VSPACE_EXTRA + OP_TYPESTAR:
case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
count = 0;
QS4:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
BOOL OK;
switch (c)
{
case 0x000a:
case 0x000b:
case 0x000c:
case 0x000d:
case 0x0085:
case 0x2028:
case 0x2029:
OK = TRUE;
break;
default:
OK = FALSE;
break;
}
if (OK == (d == OP_VSPACE))
{
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW_DATA(-(state_offset + count), 0, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_HSPACE_EXTRA + OP_TYPEQUERY:
case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
count = 2;
goto QS5;
case OP_HSPACE_EXTRA + OP_TYPESTAR:
case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
count = 0;
QS5:
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
BOOL OK;
switch (c)
{
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
OK = TRUE;
break;
default:
OK = FALSE;
break;
}
if (OK == (d == OP_HSPACE))
{
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW_DATA(-(state_offset + count), 0, 0);
}
}
break;
/*-----------------------------------------------------------------*/
#ifdef SUPPORT_UCP
case OP_PROP_EXTRA + OP_TYPEEXACT:
case OP_PROP_EXTRA + OP_TYPEUPTO:
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
{ ADD_ACTIVE(state_offset + 6, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
BOOL OK;
int category = _pcre_ucp_findprop(c, &chartype, &script);
switch(code[4])
{
case PT_ANY:
OK = TRUE;
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
OK = category == code[5];
break;
case PT_PC:
OK = chartype == code[5];
break;
case PT_SC:
OK = script == code[5];
break;
/* Should never occur, but keep compilers from grumbling. */
default:
OK = codevalue != OP_PROP;
break;
}
if (OK == (d == OP_PROP))
{
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
if (++count >= GET2(code, 1))
{ ADD_NEW(state_offset + 6, 0); }
else
{ ADD_NEW(state_offset, count); }
}
}
break;
/*-----------------------------------------------------------------*/
case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
{ ADD_ACTIVE(state_offset + 4, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
{
const uschar *nptr = ptr + clen;
int ncount = 0;
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
while (nptr < end_subject)
{
int nd;
int ndlen = 1;
GETCHARLEN(nd, nptr, ndlen);
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
ncount++;
nptr += ndlen;
}
if (++count >= GET2(code, 1))
{ ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
else
{ ADD_NEW_DATA(-state_offset, count, ncount); }
}
break;
#endif
/*-----------------------------------------------------------------*/
case OP_ANYNL_EXTRA + OP_TYPEEXACT:
case OP_ANYNL_EXTRA + OP_TYPEUPTO:
case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
{ ADD_ACTIVE(state_offset + 4, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
int ncount = 0;
switch (c)
{
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
goto ANYNL03;
case 0x000d:
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
/* Fall through */
ANYNL03:
case 0x000a:
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
if (++count >= GET2(code, 1))
{ ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
else
{ ADD_NEW_DATA(-state_offset, count, ncount); }
break;
default:
break;
}
}
break;
/*-----------------------------------------------------------------*/
case OP_VSPACE_EXTRA + OP_TYPEEXACT:
case OP_VSPACE_EXTRA + OP_TYPEUPTO:
case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
{ ADD_ACTIVE(state_offset + 4, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
BOOL OK;
switch (c)
{
case 0x000a:
case 0x000b:
case 0x000c:
case 0x000d:
case 0x0085:
case 0x2028:
case 0x2029:
OK = TRUE;
break;
default:
OK = FALSE;
}
if (OK == (d == OP_VSPACE))
{
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
if (++count >= GET2(code, 1))
{ ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
else
{ ADD_NEW_DATA(-state_offset, count, 0); }
}
}
break;
/*-----------------------------------------------------------------*/
case OP_HSPACE_EXTRA + OP_TYPEEXACT:
case OP_HSPACE_EXTRA + OP_TYPEUPTO:
case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
{ ADD_ACTIVE(state_offset + 4, 0); }
count = current_state->count; /* Number already matched */
if (clen > 0)
{
BOOL OK;
switch (c)
{
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
OK = TRUE;
break;
default:
OK = FALSE;
break;
}
if (OK == (d == OP_HSPACE))
{
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
if (++count >= GET2(code, 1))
{ ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
else
{ ADD_NEW_DATA(-state_offset, count, 0); }
}
}
break;
/* ========================================================================== */
/* These opcodes are followed by a character that is usually compared
to the current subject character; it is loaded into d. We still get
here even if there is no subject character, because in some cases zero
repetitions are permitted. */
/*-----------------------------------------------------------------*/
case OP_CHAR:
if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_CHARNC:
if (clen == 0) break;
#ifdef SUPPORT_UTF8
if (utf8)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
unsigned int othercase;
if (c < 128) othercase = fcc[c]; else
/* If we have Unicode property support, we can use it to test the
other case of the character. */
#ifdef SUPPORT_UCP
othercase = _pcre_ucp_othercase(c);
#else
othercase = NOTACHAR;
#endif
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
}
}
else
#endif /* SUPPORT_UTF8 */
/* Non-UTF-8 mode */
{
if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
}
break;
#ifdef SUPPORT_UCP
/*-----------------------------------------------------------------*/
/* This is a tricky one because it can match more than one character.
Find out how many characters to skip, and then set up a negative state
to wait for them to pass before continuing. */
case OP_EXTUNI:
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
{
const uschar *nptr = ptr + clen;
int ncount = 0;
while (nptr < end_subject)
{
int nclen = 1;
GETCHARLEN(c, nptr, nclen);
if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
ncount++;
nptr += nclen;
}
ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
}
break;
#endif
/*-----------------------------------------------------------------*/
/* This is a tricky one like EXTUNI because it too can match more than one
character (when CR is followed by LF). In this case, set up a negative
state to wait for one character to pass before continuing. */
case OP_ANYNL:
if (clen > 0) switch(c)
{
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
case 0x000a:
ADD_NEW(state_offset + 1, 0);
break;
case 0x000d:
if (ptr + 1 < end_subject && ptr[1] == 0x0a)
{
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
}
else
{
ADD_NEW(state_offset + 1, 0);
}
break;
}
break;
/*-----------------------------------------------------------------*/
case OP_NOT_VSPACE:
if (clen > 0) switch(c)
{
case 0x000a:
case 0x000b:
case 0x000c:
case 0x000d:
case 0x0085:
case 0x2028:
case 0x2029:
break;
default:
ADD_NEW(state_offset + 1, 0);
break;
}
break;
/*-----------------------------------------------------------------*/
case OP_VSPACE:
if (clen > 0) switch(c)
{
case 0x000a:
case 0x000b:
case 0x000c:
case 0x000d:
case 0x0085:
case 0x2028:
case 0x2029:
ADD_NEW(state_offset + 1, 0);
break;
default: break;
}
break;
/*-----------------------------------------------------------------*/
case OP_NOT_HSPACE:
if (clen > 0) switch(c)
{
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
break;
default:
ADD_NEW(state_offset + 1, 0);
break;
}
break;
/*-----------------------------------------------------------------*/
case OP_HSPACE:
if (clen > 0) switch(c)
{
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
ADD_NEW(state_offset + 1, 0);
break;
}
break;
/*-----------------------------------------------------------------*/
/* Match a negated single character. This is only used for one-byte
characters, that is, we know that d < 256. The character we are
checking (c) can be multibyte. */
case OP_NOT:
if (clen > 0)
{
unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
}
break;
/*-----------------------------------------------------------------*/
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
case OP_NOTPLUS:
case OP_NOTMINPLUS:
case OP_NOTPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = _pcre_ucp_othercase(d);
#endif /* SUPPORT_UCP */
}
else
#endif /* SUPPORT_UTF8 */
otherd = fcc[d];
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
if (count > 0 &&
(codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
count++;
ADD_NEW(state_offset, count);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
case OP_NOTPOSQUERY:
ADD_ACTIVE(state_offset + dlen + 1, 0);
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = _pcre_ucp_othercase(d);
#endif /* SUPPORT_UCP */
}
else
#endif /* SUPPORT_UTF8 */
otherd = fcc[d];
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW(state_offset + dlen + 1, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPOSSTAR:
ADD_ACTIVE(state_offset + dlen + 1, 0);
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = _pcre_ucp_othercase(d);
#endif /* SUPPORT_UCP */
}
else
#endif /* SUPPORT_UTF8 */
otherd = fcc[d];
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
ADD_NEW(state_offset, 0);
}
}
break;
/*-----------------------------------------------------------------*/
case OP_EXACT:
case OP_NOTEXACT:
count = current_state->count; /* Number already matched */
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = _pcre_ucp_othercase(d);
#endif /* SUPPORT_UCP */
}
else
#endif /* SUPPORT_UTF8 */
otherd = fcc[d];
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
if (++count >= GET2(code, 1))
{ ADD_NEW(state_offset + dlen + 3, 0); }
else
{ ADD_NEW(state_offset, count); }
}
}
break;
/*-----------------------------------------------------------------*/
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
case OP_NOTUPTO:
case OP_NOTMINUPTO:
case OP_NOTPOSUPTO:
ADD_ACTIVE(state_offset + dlen + 3, 0);
count = current_state->count; /* Number already matched */
if (clen > 0)
{
unsigned int otherd = NOTACHAR;
if ((ims & PCRE_CASELESS) != 0)
{
#ifdef SUPPORT_UTF8
if (utf8 && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = _pcre_ucp_othercase(d);
#endif /* SUPPORT_UCP */
}
else
#endif /* SUPPORT_UTF8 */
otherd = fcc[d];
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
{
if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
{
active_count--; /* Remove non-match possibility */
next_active_state--;
}
if (++count >= GET2(code, 1))
{ ADD_NEW(state_offset + dlen + 3, 0); }
else
{ ADD_NEW(state_offset, count); }
}
}
break;
/* ========================================================================== */
/* These are the class-handling opcodes */
case OP_CLASS:
case OP_NCLASS:
case OP_XCLASS:
{
BOOL isinclass = FALSE;
int next_state_offset;
const uschar *ecode;
/* For a simple class, there is always just a 32-byte table, and we
can set isinclass from it. */
if (codevalue != OP_XCLASS)
{
ecode = code + 33;
if (clen > 0)
{
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
((code[1 + c/8] & (1 << (c&7))) != 0);
}
}
/* An extended class may have a table or a list of single characters,
ranges, or both, and it may be positive or negative. There's a
function that sorts all this out. */
else
{
ecode = code + GET(code, 1);
if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
}
/* At this point, isinclass is set for all kinds of class, and ecode
points to the byte after the end of the class. If there is a
quantifier, this is where it will be. */
next_state_offset = ecode - start_code;
switch (*ecode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
ADD_ACTIVE(next_state_offset + 1, 0);
if (isinclass) { ADD_NEW(state_offset, 0); }
break;
case OP_CRPLUS:
case OP_CRMINPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
if (isinclass) { count++; ADD_NEW(state_offset, count); }
break;
case OP_CRQUERY:
case OP_CRMINQUERY:
ADD_ACTIVE(next_state_offset + 1, 0);
if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
count = current_state->count; /* Already matched */
if (count >= GET2(ecode, 1))
{ ADD_ACTIVE(next_state_offset + 5, 0); }
if (isinclass)
{
int max = GET2(ecode, 3);
if (++count >= max && max != 0) /* Max 0 => no limit */
{ ADD_NEW(next_state_offset + 5, 0); }
else
{ ADD_NEW(state_offset, count); }
}
break;
default:
if (isinclass) { ADD_NEW(next_state_offset, 0); }
break;
}
}
break;
/* ========================================================================== */
/* These are the opcodes for fancy brackets of various kinds. We have
to use recursion in order to handle them. The "always failing" assertion
(?!) is optimised when compiling to OP_FAIL, so we have to support that,
though the other "backtracking verbs" are not supported. */
case OP_FAIL:
break;
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
{
int rc;
int local_offsets[2];
int local_workspace[1000];
const uschar *endasscode = code + GET(code, 1);
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
rc = internal_dfa_exec(
md, /* static match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing); /* pass on regex recursion */
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
}
break;
/*-----------------------------------------------------------------*/
case OP_COND:
case OP_SCOND:
{
int local_offsets[1000];
int local_workspace[1000];
int condcode = code[LINK_SIZE+1];
/* Back reference conditions are not supported */
if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
/* The DEFINE condition is always false */
if (condcode == OP_DEF)
{
ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
}
/* The only supported version of OP_RREF is for the value RREF_ANY,
which means "test if in any recursion". We can't test for specifically
recursed groups. */
else if (condcode == OP_RREF)
{
int value = GET2(code, LINK_SIZE+2);
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
}
/* Otherwise, the condition is an assertion */
else
{
int rc;
const uschar *asscode = code + LINK_SIZE + 1;
const uschar *endasscode = asscode + GET(asscode, 1);
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
rc = internal_dfa_exec(
md, /* fixed match data */
asscode, /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing); /* pass on regex recursion */
if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
else
{ ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
}
}
break;
/*-----------------------------------------------------------------*/
case OP_RECURSE:
{
int local_offsets[1000];
int local_workspace[1000];
int rc;
DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
recursing + 1));
rc = internal_dfa_exec(
md, /* fixed match data */
start_code + GET(code, 1), /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing + 1); /* regex recurse level */
DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
recursing + 1, rc));
/* Ran out of internal offsets */
if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
/* For each successful matched substring, set up the next state with a
count of characters to skip before trying it. Note that the count is in
characters, not bytes. */
if (rc > 0)
{
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
{
const uschar *p = start_subject + local_offsets[rc];
const uschar *pp = start_subject + local_offsets[rc+1];
int charcount = local_offsets[rc+1] - local_offsets[rc];
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
if (charcount > 0)
{
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
}
else
{
ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
}
}
}
else if (rc != PCRE_ERROR_NOMATCH) return rc;
}
break;
/*-----------------------------------------------------------------*/
case OP_ONCE:
{
int local_offsets[2];
int local_workspace[1000];
int rc = internal_dfa_exec(
md, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
ims, /* the current ims flags */
rlevel, /* function recursion level */
recursing); /* pass on regex recursion */
if (rc >= 0)
{
const uschar *end_subpattern = code;
int charcount = local_offsets[1] - local_offsets[0];
int next_state_offset, repeat_state_offset;
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
arrange for the repeat state also to be added to the relevant list.
Calculate the offset, or set -1 for no repeat. */
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
*end_subpattern == OP_KETRMIN)?
end_subpattern - start_code - GET(end_subpattern, 1) : -1;
/* If we have matched an empty string, add the next state at the
current character pointer. This is important so that the duplicate
checking kicks in, which is what breaks infinite loops that match an
empty string. */
if (charcount == 0)
{
ADD_ACTIVE(next_state_offset, 0);
}
/* Optimization: if there are no more active states, and there
are no new states yet set up, then skip over the subject string
right here, to save looping. Otherwise, set up the new state to swing
into action when the end of the substring is reached. */
else if (i + 1 >= active_count && new_count == 0)
{
ptr += charcount;
clen = 0;
ADD_NEW(next_state_offset, 0);
/* If we are adding a repeat state at the new character position,
we must fudge things so that it is the only current state.
Otherwise, it might be a duplicate of one we processed before, and
that would cause it to be skipped. */
if (repeat_state_offset >= 0)
{
next_active_state = active_states;
active_count = 0;
i = -1;
ADD_ACTIVE(repeat_state_offset, 0);
}
}
else
{
const uschar *p = start_subject + local_offsets[0];
const uschar *pp = start_subject + local_offsets[1];
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
if (repeat_state_offset >= 0)
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
}
}
else if (rc != PCRE_ERROR_NOMATCH) return rc;
}
break;
/* ========================================================================== */
/* Handle callouts */
case OP_CALLOUT:
if (pcre_callout != NULL)
{
int rrc;
pcre_callout_block cb;
cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = code[1];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
cb.subject_length = end_subject - start_subject;
cb.start_match = current_subject - start_subject;
cb.current_position = ptr - start_subject;
cb.pattern_position = GET(code, 2);
cb.next_item_length = GET(code, 2 + LINK_SIZE);
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
}
break;
/* ========================================================================== */
default: /* Unsupported opcode */
return PCRE_ERROR_DFA_UITEM;
}
NEXT_ACTIVE_STATE: continue;
} /* End of loop scanning active states */
/* We have finished the processing at the current subject character. If no
new states have been set for the next character, we have found all the
matches that we are going to find. If we are at the top level and partial
matching has been requested, check for appropriate conditions. */
if (new_count <= 0)
{
if (match_count < 0 && /* No matches found */
rlevel == 1 && /* Top level match function */
(md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
ptr >= end_subject && /* Reached end of subject */
ptr > current_subject) /* Matched non-empty string */
{
if (offsetcount >= 2)
{
offsets[0] = current_subject - start_subject;
offsets[1] = end_subject - start_subject;
}
match_count = PCRE_ERROR_PARTIAL;
}
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
rlevel*2-2, SP));
break; /* In effect, "return", but see the comment below */
}
/* One or more states are active for the next character. */
ptr += clen; /* Advance to next subject character */
} /* Loop to move along the subject string */
/* Control gets here from "break" a few lines above. We do it this way because
if we use "return" above, we have compiler trouble. Some compilers warn if
there's nothing here because they think the function doesn't return a value. On
the other hand, if we put a dummy statement here, some more clever compilers
complain that it can't be reached. Sigh. */
return match_count;
}
/*************************************************
* Execute a Regular Expression - DFA engine *
*************************************************/
| pcredfa.c | 282 |
PCRE_EXP_DEFN INT | pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount)
/* Match a compiled pattern against a subject string using the DFA engine.

Arguments:
  argument_re   the compiled pattern
  extra_data    optional block that may carry study data, callout data and
                character tables; may be NULL
  subject       the subject string
  length        length of the subject, in bytes
  start_offset  byte offset in the subject at which matching starts
  options       match-time option bits
  offsets       vector that receives match offsets (may be NULL only when
                offsetcount is 0)
  offsetcount   number of ints in offsets
  workspace     working vector handed on to internal_dfa_exec()
  wscount       number of ints in workspace (must be at least 20)

Returns:        whatever internal_dfa_exec() yields for the first start
                position whose result is not PCRE_ERROR_NOMATCH (or for the
                only position tried when the match is anchored);
                PCRE_ERROR_NOMATCH when no start position succeeds;
                otherwise one of the negative error codes returned by the
                argument checks below
*/

PCRE_EXP_DEFN int
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
  const char *subject, int length, int start_offset, int options, int *offsets,
  int offsetcount, int *workspace, int wscount)
{
real_pcre *re = (real_pcre *)argument_re;
dfa_match_data match_block;
dfa_match_data *md = &match_block;
BOOL utf8, anchored, startline, firstline;
const uschar *current_subject, *end_subject, *lcc;

pcre_study_data internal_study;
const pcre_study_data *study = NULL;
real_pcre internal_re;

const uschar *req_byte_ptr;
const uschar *start_bits = NULL;
BOOL first_byte_caseless = FALSE;
BOOL req_byte_caseless = FALSE;
int first_byte = -1;
int req_byte = -1;
int req_byte2 = -1;
int newline;

/* Plausibility checks */

if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL || workspace == NULL ||
   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;

/* We need to find the pointer to any study data before we test for byte
flipping, so we scan the extra_data block first. This may set two fields in the
match block, so we must initialize them beforehand. However, the other fields
in the match block must not be set until after the byte flipping. */

md->tables = re->tables;
md->callout_data = NULL;

if (extra_data != NULL)
  {
  unsigned int flags = extra_data->flags;
  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
    study = (const pcre_study_data *)extra_data->study_data;
  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
    return PCRE_ERROR_DFA_UMLIMIT;
  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
    md->callout_data = extra_data->callout_data;
  if ((flags & PCRE_EXTRA_TABLES) != 0)
    md->tables = extra_data->tables;
  }

/* Check that the first field in the block is the magic number. If it is not,
test for a regex that was compiled on a host of opposite endianness. If this is
the case, flipped values are put in internal_re and internal_study if there was
study data too. */

if (re->magic_number != MAGIC_NUMBER)
  {
  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
  if (re == NULL) return PCRE_ERROR_BADMAGIC;
  if (study != NULL) study = &internal_study;
  }

/* Set some local values */

current_subject = (const unsigned char *)subject + start_offset;
end_subject = (const unsigned char *)subject + length;
req_byte_ptr = current_subject - 1;

#ifdef SUPPORT_UTF8
utf8 = (re->options & PCRE_UTF8) != 0;
#else
utf8 = FALSE;
#endif

anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
  (re->options & PCRE_ANCHORED) != 0;

/* The remaining fixed data for passing around. */

md->start_code = (const uschar *)argument_re +
    re->name_table_offset + re->name_count * re->name_entry_size;
md->start_subject = (const unsigned char *)subject;
md->end_subject = end_subject;
md->moptions = options;
md->poptions = re->options;

/* If the BSR option is not set at match time, copy what was set
at compile time. */

if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
  {
  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
    md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
#ifdef BSR_ANYCRLF
  else md->moptions |= PCRE_BSR_ANYCRLF;
#endif
  }

/* Handle different types of newline. The three bits give eight cases. If
nothing is set at run time, whatever was used at compile time applies. */

switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
         PCRE_NEWLINE_BITS)
  {
  case 0: newline = NEWLINE; break;   /* Compile-time default */
  case PCRE_NEWLINE_CR: newline = '\r'; break;
  case PCRE_NEWLINE_LF: newline = '\n'; break;
  case PCRE_NEWLINE_CR+
       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
  case PCRE_NEWLINE_ANY: newline = -1; break;
  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
  default: return PCRE_ERROR_BADNEWLINE;
  }

if (newline == -2)
  {
  md->nltype = NLTYPE_ANYCRLF;
  }
else if (newline < 0)
  {
  md->nltype = NLTYPE_ANY;
  }
else
  {
  md->nltype = NLTYPE_FIXED;
  if (newline > 255)
    {
    md->nllen = 2;
    md->nl[0] = (newline >> 8) & 255;
    md->nl[1] = newline & 255;
    }
  else
    {
    md->nllen = 1;
    md->nl[0] = newline;
    }
  }

/* Check a UTF-8 string if required. Unfortunately there's no way of passing
back the character offset. */

#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
  {
  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
    return PCRE_ERROR_BADUTF8;
  if (start_offset > 0 && start_offset < length)
    {
    int tb = ((uschar *)subject)[start_offset];
    if (tb > 127)
      {
      tb &= 0xc0;
      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
      }
    }
  }
#endif

/* If the exec call supplied NULL for tables, use the inbuilt ones. This
is a feature that makes it possible to save compiled regex and re-use them
in other programs later. */

if (md->tables == NULL) md->tables = _pcre_default_tables;

/* The lower casing table and the "must be at the start of a line" flag are
used in a loop when finding where to start. */

lcc = md->tables + lcc_offset;
startline = (re->flags & PCRE_STARTLINE) != 0;
firstline = (re->options & PCRE_FIRSTLINE) != 0;

/* Set up the first character to match, if available. The first_byte value is
never set for an anchored regular expression, but the anchoring may be forced
at run time, so we have to test for anchoring. The first char may be unset for
an unanchored pattern, of course. If there's no first char and the pattern was
studied, there may be a bitmap of possible first characters. The bitmap is
only consulted by the start-point scan below when the startline optimization
does not apply, so it is picked up only when startline is FALSE; testing for
startline being TRUE here would leave the bitmap branch unreachable. */

if (!anchored)
  {
  if ((re->flags & PCRE_FIRSTSET) != 0)
    {
    first_byte = re->first_byte & 255;
    if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
      first_byte = lcc[first_byte];
    }
  else
    {
    if (!startline && study != NULL &&
         (study->options & PCRE_STUDY_MAPPED) != 0)
      start_bits = study->start_bits;
    }
  }

/* For anchored or unanchored matches, there may be a "last known required
character" set. */

if ((re->flags & PCRE_REQCHSET) != 0)
  {
  req_byte = re->req_byte & 255;
  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
  req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
  }

/* Call the main matching function, looping for a non-anchored regex after a
failed match. Unless restarting, optimize by moving to the first match
character if possible, when not anchored. Then unless wanting a partial match,
check for a required later character. */

for (;;)
  {
  int rc;

  if ((options & PCRE_DFA_RESTART) == 0)
    {
    const uschar *save_end_subject = end_subject;

    /* Advance to a unique first char if possible. If firstline is TRUE, the
    start of the match is constrained to the first line of a multiline string.
    Implement this by temporarily adjusting end_subject so that we stop
    scanning at a newline. If the match fails at the newline, later code breaks
    this loop. */

    if (firstline)
      {
      const uschar *t = current_subject;
      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
      end_subject = t;
      }

    if (first_byte >= 0)
      {
      if (first_byte_caseless)
        while (current_subject < end_subject &&
               lcc[*current_subject] != first_byte)
          current_subject++;
      else
        while (current_subject < end_subject && *current_subject != first_byte)
          current_subject++;
      }

    /* Or to just after a linebreak for a multiline match if possible */

    else if (startline)
      {
      if (current_subject > md->start_subject + start_offset)
        {
        /* Stop at end_subject at the latest. Letting the pointer step one
        beyond the end would make the test of current_subject[-1] below read
        the byte at end_subject, which is outside the subject, and would start
        the next match attempt past the end of the subject. */

        while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
          current_subject++;

        /* If we have just passed a CR and the newline option is ANY or
        ANYCRLF, and we are now at a LF, advance the match position by one more
        character. */

        if (current_subject[-1] == '\r' &&
             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
             current_subject < end_subject &&
             *current_subject == '\n')
          current_subject++;
        }
      }

    /* Or to a non-unique first char after study */

    else if (start_bits != NULL)
      {
      while (current_subject < end_subject)
        {
        register unsigned int c = *current_subject;
        if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
          else break;
        }
      }

    /* Restore fudged end_subject */

    end_subject = save_end_subject;
    }

  /* If req_byte is set, we know that that character must appear in the subject
  for the match to succeed. If the first character is set, req_byte must be
  later in the subject; otherwise the test starts at the match point. This
  optimization can save a huge amount of work in patterns with nested unlimited
  repeats that aren't going to match. Writing separate code for cased/caseless
  versions makes it go faster, as does using an autoincrement and backing off
  on a match.

  HOWEVER: when the subject string is very, very long, searching to its end can
  take a long time, and give bad performance on quite ordinary patterns. This
  showed up when somebody was matching /^C/ on a 32-megabyte string... so we
  don't do this when the string is sufficiently long.

  ALSO: this processing is disabled when partial matching is requested.
  */

  if (req_byte >= 0 &&
      end_subject - current_subject < REQ_BYTE_MAX &&
      (options & PCRE_PARTIAL) == 0)
    {
    register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);

    /* We don't need to repeat the search if we haven't yet reached the
    place we found it at last time. */

    if (p > req_byte_ptr)
      {
      if (req_byte_caseless)
        {
        while (p < end_subject)
          {
          register int pp = *p++;
          if (pp == req_byte || pp == req_byte2) { p--; break; }
          }
        }
      else
        {
        while (p < end_subject)
          {
          if (*p++ == req_byte) { p--; break; }
          }
        }

      /* If we can't find the required character, break the matching loop,
      which will cause a return of PCRE_ERROR_NOMATCH. */

      if (p >= end_subject) break;

      /* If we have found the required character, save the point where we
      found it, so that we don't search again next time round the loop if
      the start hasn't passed this character yet. */

      req_byte_ptr = p;
      }
    }

  /* OK, now we can do the business */

  rc = internal_dfa_exec(
    md,                                /* fixed match data */
    md->start_code,                    /* this subexpression's code */
    current_subject,                   /* where we currently are */
    start_offset,                      /* start offset in subject */
    offsets,                           /* offset vector */
    offsetcount,                       /* size of same */
    workspace,                         /* workspace vector */
    wscount,                           /* size of same */
    re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
    0,                                 /* function recurse level */
    0);                                /* regex recurse level */

  /* Anything other than "no match" means we are done, always; otherwise, carry
  on only if not anchored. */

  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;

  /* Advance to the next subject character unless we are at the end of a line
  and firstline is set. */

  if (firstline && IS_NEWLINE(current_subject)) break;
  current_subject++;
  if (utf8)
    {
    while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
      current_subject++;
    }
  if (current_subject > end_subject) break;

  /* If we have just passed a CR and we are now at a LF, and the pattern does
  not contain any explicit matches for \r or \n, and the newline option is CRLF
  or ANY or ANYCRLF, advance the match position by one more character. */

  if (current_subject[-1] == '\r' &&
      current_subject < end_subject &&
      *current_subject == '\n' &&
      (re->flags & PCRE_HASCRORLF) == 0 &&
        (md->nltype == NLTYPE_ANY ||
         md->nltype == NLTYPE_ANYCRLF ||
         md->nllen == 2))
    current_subject++;

  }   /* "Bumpalong" loop */

return PCRE_ERROR_NOMATCH;
}
| pcredfa.c | 2511 |
pcreexec.c |
Type | Function | Source | Line |
STATIC VOID | pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
/* Debugging aid: write a run of bytes to stdout. Printable bytes are written
as themselves and every other byte as a \xhh escape. When the bytes come from
the subject string, the count is first clipped so that printing stops at
md->end_subject.

Arguments:
  p           first byte to print
  length      number of bytes to print
  is_subject  TRUE if p points into the subject string
  md          match data block (supplies end_subject)

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int ch;

if (is_subject && length > md->end_subject - p)
  {
  length = md->end_subject - p;
  }

for (; length > 0; length--)
  {
  ch = *p++;
  if (isprint(ch))
    {
    printf("%c", ch);
    }
  else
    {
    printf("\\x%02x", ch);
    }
  }
}
#endif
/*************************************************
* Match a back-reference *
*************************************************/
| pcreexec.c | 109 |
STATIC BOOL | match_ref(int offset, register USPTR eptr, int length, match_data *md, unsigned long int ims)
/* Compare the subject at eptr with a previously captured substring.

Arguments:
  offset      index into md->offset_vector giving the start of the captured
              substring
  eptr        current position in the subject
  length      number of bytes to compare
  md          match data block
  ims         current option flags; only PCRE_CASELESS is examined here

Returns:      TRUE if all length bytes match (via md->lcc when caseless),
              FALSE on a mismatch or when fewer than length bytes remain
              before md->end_subject
*/

static BOOL
match_ref(int offset, register USPTR eptr, int length, match_data *md,
  unsigned long int ims)
{
USPTR ref = md->start_subject + md->offset_vector[offset];

#ifdef DEBUG
printf("matching subject ");
if (eptr < md->end_subject)
  {
  pchars(eptr, length, TRUE, md);
  }
printf(" against backref ");
pchars(ref, length, FALSE, md);
printf("\n");
#endif

/* Too few bytes left in the subject: this can never match */

if (length > md->end_subject - eptr) return FALSE;

/* The caseful and caseless comparisons are kept as separate loops so that
the option test is made only once. */

if ((ims & PCRE_CASELESS) == 0)
  {
  for (; length > 0; length--)
    {
    if (*ref++ != *eptr++) return FALSE;
    }
  return TRUE;
  }

for (; length > 0; length--)
  {
  if (md->lcc[*ref++] != md->lcc[*eptr++]) return FALSE;
  }

return TRUE;
}
/***************************************************************************
****************************************************************************
RECURSION IN THE match() FUNCTION
The match() function is highly recursive, though not every recursive call
increases the recursive depth. Nevertheless, some regular expressions can cause
it to recurse to a great depth. I was writing for Unix, so I just let it call
itself recursively. This uses the stack for saving everything that has to be
saved for a recursive call. On Unix, the stack can be large, and this works
fine.
It turns out that on some non-Unix-like systems there are problems with
programs that use a lot of stack. (This despite the fact that every last chip
has oodles of memory these days, and techniques for extending the stack have
been known for decades.) So....
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to
achieve this so that the actual code doesn't look very different to what it
always used to.
The original heap-recursive code used longjmp(). However, it seems that this
can be very slow on some operating systems. Following a suggestion from Stan
Switzer, the use of longjmp() has been abolished, at the cost of having to
provide a unique number for each call to RMATCH. There is no way of generating
a sequence of numbers at compile time in C. I have given them names, to make
them stand out more clearly.
Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/
/* Identifiers for the RMATCH calls, numbered consecutively from 1. Whenever
this list is altered, the code at HEAP_RETURN below has to be altered to
match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10,    RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28,    RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46,    RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. */
#ifndef NO_RECURSE
#define REGISTER register
#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
{ \
printf("match() called in line %d\n", __LINE__); \
rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
printf("to line %d\n", __LINE__); \
}
#define RRETURN(ra) \
{ \
printf("match() returned %d from line %d ", ra, __LINE__); \
return ra; \
}
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
#else
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

/* No "register" qualifier is used in this configuration. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame with
pcre_stack_malloc, records in the current frame the number (rw) of the point
to come back to, copies the "arguments" into the new frame, links it to the
current one, makes it current, and jumps to HEAP_RECURSE. The label L_<rw>
is where control resumes once the simulated call has returned.

If the frame cannot be obtained, the new frame must not be written to;
instead PCRE_ERROR_NOMEMORY is handed back through RRETURN, exactly as if the
current simulated call had returned that value. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). It releases the current frame and
makes the previous one current. If there is a previous frame, the value is
put in rrc and control goes to HEAP_RETURN; otherwise this was the top-level
frame and the value is returned from the real function. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
/* Structure for remembering the local variables in a private frame. One of
these is obtained for every simulated call of match() when NO_RECURSE is
defined. Each member Xname stands in for the variable "name" of match(). */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Calling frame; NULL at the top level */

  /* Function arguments that may change */

  const uschar *Xeptr;
  const uschar *Xecode;
  const uschar *Xmstart;
  int Xoffset_top;
  unsigned long int Xims;         /* Same type as the ims argument of match() */
  eptrblock *Xeptrb;
  int Xflags;
  unsigned int Xrdepth;

  /* Function local variables */

  const uschar *Xcallpat;
  const uschar *Xcharptr;
  const uschar *Xdata;
  const uschar *Xnext;
  const uschar *Xpp;
  const uschar *Xprev;
  const uschar *Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_script;
  int Xoclength;
  uschar Xocchars[8];
#endif

  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: set by RMATCH to the number of the call point */

  int Xwhere;

} heapframe;
#endif
/***************************************************************************
***************************************************************************/
/*************************************************
* Match from current position *
*************************************************/
| pcreexec.c | 138 |
STATIC INT | match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth)
static int
match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
int flags, unsigned int rdepth)
{
/* These variables do not need to be preserved over recursion in this function,
so they can be ordinary variables in all cases. Mark some of them with
"register" because they are used a lot in loops. */
register int rrc; /* Returns from recursive calls */
register int i; /* Used for loops not involving calls to RMATCH() */
register unsigned int c; /* Character values not kept over RMATCH() calls */
register BOOL utf8; /* Local copy of UTF-8 flag for speed */
BOOL minimize, possessive; /* Quantifier options */
/* When recursion is not being used, all "local" variables that have to be
preserved over calls to RMATCH() are part of a "frame" which is obtained from
heap storage. Set up the top-level frame here; others are obtained from the
heap whenever RMATCH() does a "recursion". See the macro definitions above. */
#ifdef NO_RECURSE
heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
frame->Xprevframe = NULL; /* Marks the top level */
/* Copy in the original argument variables */
frame->Xeptr = eptr;
frame->Xecode = ecode;
frame->Xmstart = mstart;
frame->Xoffset_top = offset_top;
frame->Xims = ims;
frame->Xeptrb = eptrb;
frame->Xflags = flags;
frame->Xrdepth = rdepth;
/* This is where control jumps back to to effect "recursion" */
HEAP_RECURSE:
/* Macros make the argument variables come from the current frame */
#define eptr frame->Xeptr
#define ecode frame->Xecode
#define mstart frame->Xmstart
#define offset_top frame->Xoffset_top
#define ims frame->Xims
#define eptrb frame->Xeptrb
#define flags frame->Xflags
#define rdepth frame->Xrdepth
/* Ditto for the local variables */
#ifdef SUPPORT_UTF8
#define charptr frame->Xcharptr
#endif
#define callpat frame->Xcallpat
#define data frame->Xdata
#define next frame->Xnext
#define pp frame->Xpp
#define prev frame->Xprev
#define saved_eptr frame->Xsaved_eptr
#define new_recursive frame->Xnew_recursive
#define cur_is_word frame->Xcur_is_word
#define condition frame->Xcondition
#define prev_is_word frame->Xprev_is_word
#define original_ims frame->Xoriginal_ims
#ifdef SUPPORT_UCP
#define prop_type frame->Xprop_type
#define prop_value frame->Xprop_value
#define prop_fail_result frame->Xprop_fail_result
#define prop_category frame->Xprop_category
#define prop_chartype frame->Xprop_chartype
#define prop_script frame->Xprop_script
#define oclength frame->Xoclength
#define occhars frame->Xocchars
#endif
#define ctype frame->Xctype
#define fc frame->Xfc
#define fi frame->Xfi
#define length frame->Xlength
#define max frame->Xmax
#define min frame->Xmin
#define number frame->Xnumber
#define offset frame->Xoffset
#define op frame->Xop
#define save_capture_last frame->Xsave_capture_last
#define save_offset1 frame->Xsave_offset1
#define save_offset2 frame->Xsave_offset2
#define save_offset3 frame->Xsave_offset3
#define stacksave frame->Xstacksave
#define newptrb frame->Xnewptrb
/* When recursion is being used, local variables are allocated on the stack and
get preserved during recursion in the normal way. In this environment, fi and
i, and fc and c, can be the same variables. */
#else /* NO_RECURSE not defined */
#define fi i
#define fc c
#ifdef SUPPORT_UTF8 /* Many of these variables are used only */
const uschar *charptr; /* in small blocks of the code. My normal */
#endif /* style of coding would have declared */
const uschar *callpat; /* them within each of those blocks. */
const uschar *data; /* However, in order to accommodate the */
const uschar *next; /* version of this code that uses an */
USPTR pp; /* external "stack" implemented on the */
const uschar *prev; /* heap, it is easier to declare them all */
USPTR saved_eptr; /* here, so the declarations can be cut */
/* out in a block. The only declarations */
recursion_info new_recursive; /* within blocks below are for variables */
/* that do not have to be preserved over */
BOOL cur_is_word; /* a recursive call to RMATCH(). */
BOOL condition;
BOOL prev_is_word;
unsigned long int original_ims;
#ifdef SUPPORT_UCP
int prop_type;
int prop_value;
int prop_fail_result;
int prop_category;
int prop_chartype;
int prop_script;
int oclength;
uschar occhars[8];
#endif
int ctype;
int length;
int max;
int min;
int number;
int offset;
int op;
int save_capture_last;
int save_offset1, save_offset2, save_offset3;
int stacksave[REC_STACK_SAVE_MAX];
eptrblock newptrb;
#endif /* NO_RECURSE */
/* These statements are here to stop the compiler complaining about
uninitialized variables. */
#ifdef SUPPORT_UCP
prop_value = 0;
prop_fail_result = 0;
#endif
/* This label is used for tail recursion, which is used in a few cases even
when NO_RECURSE is not defined, in order to reduce the amount of stack that is
used. Thanks to Ian Taylor for noticing this possibility and sending the
original patch. */
TAIL_RECURSE:
/* OK, now we can get on with the real code of the function. Recursive calls
are specified by the macro RMATCH and RRETURN is used to return. When
NO_RECURSE is *not* defined, these just turn into a recursive call to match()
and a "return", respectively (possibly with some debugging if DEBUG is
defined). However, RMATCH isn't like a function call because it's quite a
complicated macro. It has to be used in one particular way. This shouldn't,
however, impact performance when true recursion is being used. */
#ifdef SUPPORT_UTF8
utf8 = md->utf8; /* Local copy of the flag */
#else
utf8 = FALSE;
#endif
/* First check that we haven't called match() too many times, or that we
haven't exceeded the recursive call limit. */
if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
original_ims = ims; /* Save for resetting on ')' */
/* At the start of a group with an unlimited repeat that may match an empty
string, the match_cbegroup flag is set. When this is the case, add the current
subject pointer to the chain of such remembered pointers, to be checked when we
hit the closing ket, in order to break infinite loops that match no characters.
When match() is called in other circumstances, don't add to the chain. The
match_cbegroup flag must NOT be used with tail recursion, because the memory
block that is used is on the stack, so a new one may be required for each
match(). */
if ((flags & match_cbegroup) != 0)
{
newptrb.epb_saved_eptr = eptr;
newptrb.epb_prev = eptrb;
eptrb = &newptrb;
}
/* Now start processing the opcodes. */
for (;;)
{
minimize = possessive = FALSE;
op = *ecode;
/* For partial matching, remember if we ever hit the end of the subject after
matching at least one subject character. */
if (md->partial &&
eptr >= md->end_subject &&
eptr > mstart)
md->hitend = TRUE;
switch(op)
{
case OP_FAIL:
RRETURN(MATCH_NOMATCH);
case OP_PRUNE:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM51);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_PRUNE);
case OP_COMMIT:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM52);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_COMMIT);
case OP_SKIP:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM53);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
md->start_match_ptr = eptr; /* Pass back current position */
RRETURN(MATCH_SKIP);
case OP_THEN:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM54);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_THEN);
/* Handle a capturing bracket. If there is space in the offset vector, save
the current subject position in the working slot at the top of the vector.
We mustn't change the current values of the data slot, because they may be
set from a previous iteration of this group, and be referred to by a
reference inside the group.
If the bracket fails to match, we need to restore this value and also the
values of the final offsets, in case they were set by a previous iteration
of the same bracket.
If there isn't enough space in the offset vector, treat this as if it were
a non-capturing bracket. Don't worry about setting the flag for the error
case here; that is handled in the code for KET. */
case OP_CBRA:
case OP_SCBRA:
number = GET2(ecode, 1+LINK_SIZE);
offset = number << 1;
#ifdef DEBUG
printf("start bracket %d\n", number);
printf("subject=");
pchars(eptr, 16, TRUE, md);
printf("\n");
#endif
if (offset < md->offset_max)
{
save_offset1 = md->offset_vector[offset];
save_offset2 = md->offset_vector[offset+1];
save_offset3 = md->offset_vector[md->offset_end - number];
save_capture_last = md->capture_last;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
flags = (op == OP_SCBRA)? match_cbegroup : 0;
do
{
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM1);
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
DPRINTF(("bracket %d failed\n", number));
md->offset_vector[offset] = save_offset1;
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
RRETURN(MATCH_NOMATCH);
}
/* FALL THROUGH ... Insufficient room for saving captured contents. Treat
as a non-capturing bracket. */
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
DPRINTF(("insufficient capture room: treat as non-capturing\n"));
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
/* Non-capturing bracket. Loop for all the alternatives. When we get to the
final alternative within the brackets, we would return the result of a
recursive call to match() whatever happened. We can reduce stack usage by
turning this into a tail recursion, except in the case when match_cbegroup
is set.*/
case OP_BRA:
case OP_SBRA:
DPRINTF(("start non-capturing bracket\n"));
flags = (op >= OP_SBRA)? match_cbegroup : 0;
for (;;)
{
if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
{
if (flags == 0) /* Not a possibly empty group */
{
ecode += _pcre_OP_lengths[*ecode];
DPRINTF(("bracket 0 tail recursion\n"));
goto TAIL_RECURSE;
}
/* Possibly empty group; can't use tail recursion. */
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
eptrb, flags, RM48);
RRETURN(rrc);
}
/* For non-final alternatives, continue the loop for a NOMATCH result;
otherwise return. */
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
eptrb, flags, RM2);
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode, 1);
}
/* Control never reaches here. */
/* Conditional group: compilation checked that there are no more than
two branches. If the condition is false, skipping the first branch takes us
past the end if there is only one branch, but that's OK because that is
exactly what going to the ket would do. As there is only one branch to be
obeyed, we can use tail recursion to avoid using another stack frame. */
case OP_COND:
case OP_SCOND:
if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
{
offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
condition = md->recursive != NULL &&
(offset == RREF_ANY || offset == md->recursive->group_num);
ecode += condition? 3 : GET(ecode, 1);
}
else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
{
offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
condition = offset < offset_top && md->offset_vector[offset] >= 0;
ecode += condition? 3 : GET(ecode, 1);
}
else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
{
condition = FALSE;
ecode += GET(ecode, 1);
}
/* The condition is an assertion. Call match() to evaluate it - passing
match_condassert as the flags argument causes it to stop at the end of an
assertion. */
else
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
match_condassert, RM3);
if (rrc == MATCH_MATCH)
{
condition = TRUE;
ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
}
else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
{
RRETURN(rrc); /* Need braces because of following else */
}
else
{
condition = FALSE;
ecode += GET(ecode, 1);
}
}
/* We are now at the branch that is to be obeyed. As there is only one,
we can use tail recursion to avoid using another stack frame, except when
match_cbegroup is required for an unlimited repeat of a possibly empty
group. If the second alternative doesn't exist, we can just plough on. */
if (condition || *ecode == OP_ALT)
{
ecode += 1 + LINK_SIZE;
if (op == OP_SCOND) /* Possibly empty group */
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
RRETURN(rrc);
}
else /* Group must match something */
{
flags = 0;
goto TAIL_RECURSE;
}
}
else /* Condition false & no 2nd alternative */
{
ecode += 1 + LINK_SIZE;
}
break;
/* End of the pattern, either real or forced. If we are in a top-level
recursion, we should restore the offsets appropriately and continue from
after the call. */
case OP_ACCEPT:
case OP_END:
if (md->recursive != NULL && md->recursive->group_num == 0)
{
recursion_info *rec = md->recursive;
DPRINTF(("End of pattern in a (?0) recursion\n"));
md->recursive = rec->prevrec;
memmove(md->offset_vector, rec->offset_save,
rec->saved_max * sizeof(int));
mstart = rec->save_start;
ims = original_ims;
ecode = rec->after_call;
break;
}
/* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
string - backtracking will then try other alternatives, if any. */
if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
md->end_match_ptr = eptr; /* Record where we ended */
md->end_offset_top = offset_top; /* and how many extracts were taken */
md->start_match_ptr = mstart; /* and the start (\K can modify) */
RRETURN(MATCH_MATCH);
/* Change option settings */
case OP_OPT:
ims = ecode[1];
ecode += 2;
DPRINTF(("ims set to %02lx\n", ims));
break;
/* Assertion brackets. Check the alternative branches in turn - the
matching won't pass the KET for an assertion. If any one branch matches,
the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
start of each branch to move the current point backwards, so the code at
this level is identical to the lookahead case. */
case OP_ASSERT:
case OP_ASSERTBACK:
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM4);
if (rrc == MATCH_MATCH) break;
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
/* If checking an assertion for a condition, return MATCH_MATCH. */
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
/* Continue from after the assertion, updating the offsets high water
mark, since extracts may have been taken during the assertion. */
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
ecode += 1 + LINK_SIZE;
offset_top = md->end_offset_top;
continue;
/* Negative assertion: all branches must fail to match */
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM5);
if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
ecode += 1 + LINK_SIZE;
continue;
/* Move the subject pointer back. This occurs only at the start of
each branch of a lookbehind assertion. If we are too close to the start to
move back, this match function fails. When working with UTF-8 we move
back a number of characters, not bytes. */
case OP_REVERSE:
#ifdef SUPPORT_UTF8
if (utf8)
{
i = GET(ecode, 1);
while (i-- > 0)
{
eptr--;
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
BACKCHAR(eptr);
}
}
else
#endif
/* No UTF-8 support, or not in UTF-8 mode: count is byte count */
{
eptr -= GET(ecode, 1);
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
}
/* Skip to next op code */
ecode += 1 + LINK_SIZE;
break;
/* The callout item calls an external function, if one is provided, passing
details of the match so far. This is mainly for debugging, though the
function is able to force a failure. */
case OP_CALLOUT:
if (pcre_callout != NULL)
{
pcre_callout_block cb;
cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = ecode[1];
cb.offset_vector = md->offset_vector;
cb.subject = (PCRE_SPTR)md->start_subject;
cb.subject_length = md->end_subject - md->start_subject;
cb.start_match = mstart - md->start_subject;
cb.current_position = eptr - md->start_subject;
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
ecode += 2 + 2*LINK_SIZE;
break;
/* Recursion either matches the current regex, or some subexpression. The
offset data is the offset to the starting bracket from the start of the
whole pattern. (This is so that it works from duplicated subpatterns.)
If there are any capturing brackets started but not finished, we have to
save their starting points and reinstate them after the recursion. However,
we don't know how many such there are (offset_top records the completed
total) so we just have to save all the potential data. There may be up to
65535 such values, which is too large to put on the stack, but using malloc
for small numbers seems expensive. As a compromise, the stack is used when
there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
is used. If the malloc fails, the recursion cannot be performed and match()
returns PCRE_ERROR_NOMEMORY.
There are also other values that have to be saved. We use a chained
sequence of blocks that actually live on the stack. Thanks to Robin Houston
for the original version of this logic. */
case OP_RECURSE:
{
/* The operand is the offset of the called group from the start of the
compiled pattern. A target at the very start of the code is whole-pattern
recursion, recorded as group 0; otherwise the group number is read from the
called bracket. */
callpat = md->start_code + GET(ecode, 1);
new_recursive.group_num = (callpat == md->start_code)? 0 :
GET2(callpat, 1 + LINK_SIZE);
/* Add to "recursing stack" */
new_recursive.prevrec = md->recursive;
md->recursive = &new_recursive;
/* Find where to continue from afterwards */
ecode += 1 + LINK_SIZE;
new_recursive.after_call = ecode;
/* Now save the offset data: in stacksave when it fits, otherwise in a block
obtained from pcre_malloc. Every exit taken after a successful allocation
must release that block again. */
new_recursive.saved_max = md->offset_end;
if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
new_recursive.offset_save = stacksave;
else
{
new_recursive.offset_save =
(int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
}
memcpy(new_recursive.offset_save, md->offset_vector,
new_recursive.saved_max * sizeof(int));
/* The recursion starts a fresh match at the current subject position; the
previous start is kept so that it can be reinstated afterwards. */
new_recursive.save_start = mstart;
mstart = eptr;
/* OK, now we can do the recursion. For each top-level alternative we
restore the offset and recursion data. */
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
do
{
RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
md, ims, eptrb, flags, RM6);
if (rrc == MATCH_MATCH)
{
DPRINTF(("Recursion matched\n"));
md->recursive = new_recursive.prevrec;
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
RRETURN(MATCH_MATCH);
}
else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
{
/* Any other result is handed straight back to the caller. Release a
heap-allocated offset copy first, exactly as the matched and not-matched
exits do; without this the block is leaked. */
DPRINTF(("Recursion gave error %d\n", rrc));
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
RRETURN(rrc);
}
/* This alternative failed: reinstate the recursion record and the saved
offsets before trying the next alternative of the called group. */
md->recursive = &new_recursive;
memcpy(md->offset_vector, new_recursive.offset_save,
new_recursive.saved_max * sizeof(int));
callpat += GET(callpat, 1);
}
while (*callpat == OP_ALT);
DPRINTF(("Recursion didn't match\n"));
md->recursive = new_recursive.prevrec;
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
RRETURN(MATCH_NOMATCH);
}
/* Control never reaches here */
/* "Once" brackets are like assertion brackets except that after a match,
the point in the subject string is not moved back. Thus there can never be
a move back into the brackets. Friedl calls these "atomic" subpatterns.
Check the alternative branches in turn - the matching won't pass the KET
for this kind of subpattern. If any one branch matches, we carry on as at
the end of a normal bracket, leaving the subject pointer. */
case OP_ONCE:
prev = ecode;
saved_eptr = eptr;
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
if (rrc == MATCH_MATCH) break;
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
/* If hit the end of the group (which could be repeated), fail */
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
/* Continue as from after the assertion, updating the offsets high water
mark, since extracts may have been taken. */
do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 1+LINK_SIZE;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. The second "call" of match()
uses tail recursion, to avoid using another stack frame. We need to reset
any options that changed within the bracket before re-running it, so
check the next opcode. */
if (ecode[1+LINK_SIZE] == OP_OPT)
{
ims = (ims & ~PCRE_IMS) | ecode[4];
DPRINTF(("ims set to %02lx at group repeat\n", ims));
}
if (*ecode == OP_KETRMIN)
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode = prev;
flags = 0;
goto TAIL_RECURSE;
}
else /* OP_KETRMAX */
{
RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += 1 + LINK_SIZE;
flags = 0;
goto TAIL_RECURSE;
}
/* Control never gets here */
/* An alternation is the end of a branch; scan along to find the end of the
bracketed group and go to there. */
case OP_ALT:
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break;
/* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
indicating that it may occur zero times. It may repeat infinitely, or not
at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
with fixed upper repeat limits are compiled as a number of copies, with the
optional ones preceded by BRAZERO or BRAMINZERO. */
case OP_BRAZERO:
{
next = ecode+1;
RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
do next += GET(next,1); while (*next == OP_ALT);
ecode = next + 1 + LINK_SIZE;
}
break;
case OP_BRAMINZERO:
{
next = ecode+1;
do next += GET(next, 1); while (*next == OP_ALT);
RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode++;
}
break;
case OP_SKIPZERO:
{
next = ecode+1;
do next += GET(next,1); while (*next == OP_ALT);
ecode = next + 1 + LINK_SIZE;
}
break;
/* End of a group, repeated or non-repeating. */
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
prev = ecode - GET(ecode, 1);
/* If this was a group that remembered the subject start, in order to break
infinite repeats of empty string matches, retrieve the subject start from
the chain. Otherwise, set it NULL. */
if (*prev >= OP_SBRA)
{
saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
eptrb = eptrb->epb_prev; /* Backup to previous group */
}
else saved_eptr = NULL;
/* If we are at the end of an assertion group, stop matching and return
MATCH_MATCH, but record the current high water mark for use by positive
assertions. Do this also for the "once" (atomic) groups. */
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
*prev == OP_ONCE)
{
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
RRETURN(MATCH_MATCH);
}
/* For capturing groups we have to check the group number back at the start
and if necessary complete handling an extraction by setting the offsets and
bumping the high water mark. Note that whole-pattern recursion is coded as
a recurse into group 0, so it won't be picked up here. Instead, we catch it
when the OP_END is reached. Other recursion is handled here. */
if (*prev == OP_CBRA || *prev == OP_SCBRA)
{
number = GET2(prev, 1+LINK_SIZE);
offset = number << 1;
#ifdef DEBUG
printf("end bracket %d", number);
printf("\n");
#endif
md->capture_last = number;
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
md->offset_vector[offset+1] = eptr - md->start_subject;
if (offset_top <= offset) offset_top = offset + 2;
}
/* Handle a recursively called group. Restore the offsets
appropriately and continue from after the call. */
if (md->recursive != NULL && md->recursive->group_num == number)
{
recursion_info *rec = md->recursive;
DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
md->recursive = rec->prevrec;
mstart = rec->save_start;
memcpy(md->offset_vector, rec->offset_save,
rec->saved_max * sizeof(int));
ecode = rec->after_call;
ims = original_ims;
break;
}
}
/* For both capturing and non-capturing groups, reset the value of the ims
flags, in case they got changed during the group. */
ims = original_ims;
DPRINTF(("ims reset to %02lx\n", ims));
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 1 + LINK_SIZE;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. In the second case, we can use
tail recursion to avoid using another stack frame, unless we have an
unlimited repeat of a group that can match an empty string. */
flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
if (*ecode == OP_KETRMIN)
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (flags != 0) /* Could match an empty string */
{
RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
RRETURN(rrc);
}
ecode = prev;
goto TAIL_RECURSE;
}
else /* OP_KETRMAX */
{
RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += 1 + LINK_SIZE;
flags = 0;
goto TAIL_RECURSE;
}
/* Control never gets here */
/* Start of subject unless notbol, or after internal newline if multiline */
case OP_CIRC:
if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr != md->start_subject &&
(eptr == md->end_subject || !WAS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
}
/* ... else fall through */
/* Start of subject assertion */
case OP_SOD:
if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
ecode++;
break;
/* Start of match assertion */
case OP_SOM:
if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
ecode++;
break;
/* Reset the start of match point */
case OP_SET_SOM:
mstart = eptr;
ecode++;
break;
/* Assert before internal newline if multiline, or before a terminating
newline unless endonly is set, else end of subject unless noteol is set. */
case OP_DOLL:
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr < md->end_subject)
{ if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
else
{ if (md->noteol) RRETURN(MATCH_NOMATCH); }
ecode++;
break;
}
else
{
if (md->noteol) RRETURN(MATCH_NOMATCH);
if (!md->endonly)
{
if (eptr != md->end_subject &&
(!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
}
}
/* ... else fall through for endonly */
/* End of subject assertion (\z) */
case OP_EOD:
if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
ecode++;
break;
/* End of subject or ending \n assertion (\Z) */
case OP_EODN:
if (eptr != md->end_subject &&
(!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
/* Word boundary assertions */
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
{
/* Find out if the previous and current characters are "word" characters.
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
be "non-word" characters. */
#ifdef SUPPORT_UTF8
if (utf8)
{
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
const uschar *lastptr = eptr - 1;
while((*lastptr & 0xc0) == 0x80) lastptr--;
GETCHAR(c, lastptr);
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
if (eptr >= md->end_subject) cur_is_word = FALSE; else
{
GETCHAR(c, eptr);
cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
}
else
#endif
/* More streamlined when not in UTF-8 mode */
{
prev_is_word = (eptr != md->start_subject) &&
((md->ctypes[eptr[-1]] & ctype_word) != 0);
cur_is_word = (eptr < md->end_subject) &&
((md->ctypes[*eptr] & ctype_word) != 0);
}
/* Now see if the situation is what we want */
if ((*ecode++ == OP_WORD_BOUNDARY)?
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
RRETURN(MATCH_NOMATCH);
}
break;
/* Match a single character type; inline for speed */
case OP_ANY:
if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ALLANY:
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
break;
/* Match a single byte, even in UTF-8 mode. This opcode really does match
any byte, even newline, independent of the setting of PCRE_DOTALL. */
case OP_ANYBYTE:
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_NOT_DIGIT:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
if (
#ifdef SUPPORT_UTF8
c < 256 &&
#endif
(md->ctypes[c] & ctype_digit) != 0
)
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_DIGIT:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
if (
#ifdef SUPPORT_UTF8
c >= 256 ||
#endif
(md->ctypes[c] & ctype_digit) == 0
)
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_NOT_WHITESPACE:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
if (
#ifdef SUPPORT_UTF8
c < 256 &&
#endif
(md->ctypes[c] & ctype_space) != 0
)
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_WHITESPACE:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
if (
#ifdef SUPPORT_UTF8
c >= 256 ||
#endif
(md->ctypes[c] & ctype_space) == 0
)
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_NOT_WORDCHAR:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
if (
#ifdef SUPPORT_UTF8
c < 256 &&
#endif
(md->ctypes[c] & ctype_word) != 0
)
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_WORDCHAR:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
if (
#ifdef SUPPORT_UTF8
c >= 256 ||
#endif
(md->ctypes[c] & ctype_word) == 0
)
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_ANYNL:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000a:
break;
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
ecode++;
break;
case OP_NOT_HSPACE:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
switch(c)
{
default: break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
RRETURN(MATCH_NOMATCH);
}
ecode++;
break;
case OP_HSPACE:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
break;
}
ecode++;
break;
case OP_NOT_VSPACE:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
switch(c)
{
default: break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
RRETURN(MATCH_NOMATCH);
}
ecode++;
break;
case OP_VSPACE:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
break;
}
ecode++;
break;
#ifdef SUPPORT_UCP
/* Check the next character by Unicode property. We will get here only
if the support is in the binary; otherwise a compile-time error occurs. */
case OP_PROP:
case OP_NOTPROP:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
{
int chartype, script;
int category = _pcre_ucp_findprop(c, &chartype, &script);
switch(ecode[1])
{
case PT_ANY:
if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
break;
case PT_LAMP:
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
chartype == ucp_Lt) == (op == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
break;
case PT_GC:
if ((ecode[2] != category) == (op == OP_PROP))
RRETURN(MATCH_NOMATCH);
break;
case PT_PC:
if ((ecode[2] != chartype) == (op == OP_PROP))
RRETURN(MATCH_NOMATCH);
break;
case PT_SC:
if ((ecode[2] != script) == (op == OP_PROP))
RRETURN(MATCH_NOMATCH);
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
ecode += 3;
}
break;
/* Match an extended Unicode sequence. We will get here only if the support
is in the binary; otherwise a compile-time error occurs. */
case OP_EXTUNI:
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
{
int chartype, script;
int category = _pcre_ucp_findprop(c, &chartype, &script);
if (category == ucp_M) RRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf8) c = *eptr; else
{
GETCHARLEN(c, eptr, len);
}
category = _pcre_ucp_findprop(c, &chartype, &script);
if (category != ucp_M) break;
eptr += len;
}
}
ecode++;
break;
#endif
/* Match a back reference, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. The code is similar
to that for character classes, but repeated for efficiency. Then obey
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
loops). */
case OP_REF:
{
/* The reference number is doubled so that it indexes the start entry of the
group in md->offset_vector; the matching end entry is at offset+1. */
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
ecode += 3;
/* If the reference is unset, there are two possibilities:
(a) In the default, Perl-compatible state, set the length to be longer
than the amount of subject left; this ensures that every attempt at a
match fails. We can't just fail here, because of the possibility of
quantifiers with zero minima.
(b) If the JavaScript compatibility flag is set, set the length to zero
so that the back reference matches an empty string.
Otherwise, set the length to the length of what was matched by the
referenced subpattern. */
if (offset >= offset_top || md->offset_vector[offset] < 0)
length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
else
length = md->offset_vector[offset+1] - md->offset_vector[offset];
/* Set up for repetition, or handle the non-repeated case */
switch (*ecode)
{
/* Simple quantifiers: c becomes the opcode's distance from OP_CRSTAR and
indexes the rep_min/rep_max tables; an odd distance selects the minimizing
form. */
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
c = *ecode++ - OP_CRSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
break;
/* Explicit range: two 2-byte counts follow the opcode (min at +1, max at
+3); a stored max of zero means "no upper limit". */
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*ecode == OP_CRMINRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 3);
if (max == 0) max = INT_MAX;
ecode += 5;
break;
default: /* No repeat follows */
if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
eptr += length;
continue; /* With the main loop */
}
/* If the length of the reference is zero, just continue with the
main loop. */
if (length == 0) continue;
/* First, ensure the minimum number of matches are present. We get back
the length of the reference string explicitly rather than passing the
address of eptr, so that eptr can be a register variable. */
for (i = 1; i <= min; i++)
{
if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
eptr += length;
}
/* If min = max, continue at the same level without recursion.
They are not both allowed to be zero. */
if (min == max) continue;
/* If minimizing, keep trying and advancing the pointer */
if (minimize)
{
/* Try the rest of the pattern first; only if that fails (and the limit has
not been reached) consume one more copy of the referenced text and retry. */
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || !match_ref(offset, eptr, length, md, ims))
RRETURN(MATCH_NOMATCH);
eptr += length;
}
/* Control never gets here */
}
/* If maximizing, find the longest string and work backwards */
else
{
/* pp marks the position after the mandatory repetitions; backtracking gives
back one copy (length bytes) at a time and stops once eptr falls below pp. */
pp = eptr;
for (i = min; i < max; i++)
{
if (!match_ref(offset, eptr, length, md, ims)) break;
eptr += length;
}
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr -= length;
}
RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
/* Match a bit-mapped character class, possibly repeatedly. This op code is
used when all the characters in the class have values in the range 0-255,
and either the matching is caseful, or the characters are in the range
0-127 when UTF-8 processing is enabled. The only difference between
OP_CLASS and OP_NCLASS occurs when a data character outside the range is
encountered.
First, look past the end of the item to see if there is repeat information
following. Then obey similar code to character type repeats - written out
again for speed. */
case OP_NCLASS:
case OP_CLASS:
{
/* The bit map starts right after the opcode byte. It is indexed below as
data[c/8] & (1 << (c&7)) for c in 0..255, i.e. 32 bytes, so the whole item
is 33 bytes long. */
data = ecode + 1; /* Save for matching */
ecode += 33; /* Advance past the item */
switch (*ecode)
{
/* Simple quantifiers: c becomes the opcode's distance from OP_CRSTAR and
indexes the rep_min/rep_max tables; an odd distance selects the minimizing
form. */
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
c = *ecode++ - OP_CRSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
break;
/* Explicit range: min at +1, max at +3 (2 bytes each); stored max of zero
means unlimited. */
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*ecode == OP_CRMINRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 3);
if (max == 0) max = INT_MAX;
ecode += 5;
break;
default: /* No repeat follows */
min = max = 1;
break;
}
/* First, ensure the minimum number of matches are present. */
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
/* A character above 255 has no bit in the map: it fails when op is
OP_CLASS and is accepted otherwise. */
if (c > 255)
{
if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
}
else
{
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
}
else
#endif
/* Not UTF-8 mode */
{
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
c = *eptr++;
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
/* If max == min we can continue with the main loop without the
need to recurse. */
if (min == max) continue;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
if (minimize)
{
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* The limit and the end of the subject are checked before the next
character is read. */
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
if (c > 255)
{
if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
}
else
{
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
}
else
#endif
/* Not UTF-8 mode */
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
c = *eptr++;
if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
}
/* If maximizing, find the longest possible run, then work backwards. */
else
{
/* pp is the position after the mandatory matches; backtracking never goes
below it. */
pp = eptr;
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
/* Look at the character without consuming it; eptr only advances (by len)
once the character is known to be in the class. */
GETCHARLEN(c, eptr, len);
if (c > 255)
{
if (op == OP_CLASS) break;
}
else
{
if ((data[c/8] & (1 << (c&7))) == 0) break;
}
eptr += len;
}
for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
/* After stepping back one byte, BACKCHAR adjusts eptr for a multibyte
character. */
BACKCHAR(eptr);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if ((data[c/8] & (1 << (c&7))) == 0) break;
eptr++;
}
/* Give back one byte at a time until the rest of the pattern matches or the
starting position has been tried. */
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
/* Match an extended character class. This opcode is encountered only
in UTF-8 mode, because that's the only time it is compiled. */
#ifdef SUPPORT_UTF8
/* Extended class: each character is tested with _pcre_xclass() against the
class data, with the same repeat handling as for bit-mapped classes. */
case OP_XCLASS:
{
/* The class data starts after the opcode byte and a LINK_SIZE field; the
value read by GET(ecode, 1) is added to ecode to step over the whole item. */
data = ecode + 1 + LINK_SIZE; /* Save for matching */
ecode += GET(ecode, 1); /* Advance past the item */
switch (*ecode)
{
/* Simple quantifiers: c becomes the opcode's distance from OP_CRSTAR and
indexes the rep_min/rep_max tables; an odd distance selects the minimizing
form. */
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
c = *ecode++ - OP_CRSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
break;
/* Explicit range: min at +1, max at +3 (2 bytes each); stored max of zero
means unlimited. */
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*ecode == OP_CRMINRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 3);
if (max == 0) max = INT_MAX;
ecode += 5;
break;
default: /* No repeat follows */
min = max = 1;
break;
}
/* First, ensure the minimum number of matches are present. */
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
}
/* If max == min we can continue with the main loop without the
need to recurse. */
if (min == max) continue;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
if (minimize)
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* The limit and the end of the subject are checked before the next
character is read. */
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
/* If maximizing, find the longest possible run, then work backwards. */
else
{
/* pp is the position after the mandatory matches; backtracking never goes
below it. */
pp = eptr;
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
/* Look at the character without consuming it; eptr only advances (by len)
once the character is known to be in the class. */
GETCHARLEN(c, eptr, len);
if (!_pcre_xclass(c, data)) break;
eptr += len;
}
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
if (utf8) BACKCHAR(eptr);
}
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
#endif /* End of XCLASS */
/* Match one literal character, case-sensitively. In UTF-8 mode the pattern
character may occupy several bytes, which are compared byte by byte. */
case OP_CHAR:
#ifdef SUPPORT_UTF8
  if (utf8)
    {
    length = 1;
    ecode++;
    GETCHARLEN(fc, ecode, length);

    /* Not enough subject left to hold the whole character */
    if (md->end_subject - eptr < length) RRETURN(MATCH_NOMATCH);

    while (length-- > 0)
      {
      if (*eptr++ != *ecode++) RRETURN(MATCH_NOMATCH);
      }
    }
  else
#endif
  /* Non-UTF-8 mode: a single byte follows the opcode */
    {
    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
    if (*eptr++ != ecode[1]) RRETURN(MATCH_NOMATCH);
    ecode += 2;
    }
  break;
/* Match one literal character, ignoring case. */
case OP_CHARNC:
#ifdef SUPPORT_UTF8
  if (utf8)
    {
    length = 1;
    ecode++;
    GETCHARLEN(fc, ecode, length);

    /* Not enough subject left to hold the pattern character */
    if (md->end_subject - eptr < length) RRETURN(MATCH_NOMATCH);

    if (fc >= 128)
      {
      /* Multibyte pattern character: decode the subject character and
      compare values. When Unicode property support is compiled in, the other
      case of the pattern character is accepted as well. */
      unsigned int subject_char;
      GETCHARINC(subject_char, eptr);
      ecode += length;
#ifdef SUPPORT_UCP
      if (fc != subject_char && subject_char != _pcre_ucp_othercase(fc))
        RRETURN(MATCH_NOMATCH);
#else
      if (fc != subject_char) RRETURN(MATCH_NOMATCH);
#endif
      }
    else
      {
      /* A value below 128 is a single byte, so the lower-casing table can be
      applied to both sides directly. */
      if (md->lcc[*eptr++] != md->lcc[*ecode++]) RRETURN(MATCH_NOMATCH);
      }
    }
  else
#endif /* SUPPORT_UTF8 */
  /* Non-UTF-8 mode: a single byte follows the opcode */
    {
    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
    if (md->lcc[*eptr++] != md->lcc[ecode[1]]) RRETURN(MATCH_NOMATCH);
    ecode += 2;
    }
  break;
/* Match a single character repeatedly. Each opcode below only sets up min,
max and the minimize/possessive flags, then joins the shared code at
REPEATCHAR. */

case OP_EXACT:
  min = max = GET2(ecode, 1);
  ecode += 3;
  goto REPEATCHAR;

case OP_POSUPTO:
  possessive = TRUE;
  /* Fall through */

case OP_UPTO:
case OP_MINUPTO:
  min = 0;
  max = GET2(ecode, 1);
  minimize = *ecode == OP_MINUPTO;
  ecode += 3;
  goto REPEATCHAR;

case OP_POSSTAR:
  possessive = TRUE;
  min = 0;
  max = INT_MAX;
  ecode++;
  goto REPEATCHAR;

case OP_POSPLUS:
  possessive = TRUE;
  min = 1;
  max = INT_MAX;
  ecode++;
  goto REPEATCHAR;

case OP_POSQUERY:
  possessive = TRUE;
  min = 0;
  max = 1;
  ecode++;
  goto REPEATCHAR;

/* c becomes the opcode's distance from OP_STAR and indexes the rep_min and
rep_max tables; an odd distance selects the minimizing form. */

case OP_STAR:
case OP_MINSTAR:
case OP_PLUS:
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
  c = *ecode++ - OP_STAR;
  minimize = (c & 1) != 0;
  min = rep_min[c];                 /* Pick up values from tables; */
  max = rep_max[c];                 /* zero for max => infinity */
  if (max == 0) max = INT_MAX;

/* Common code for all repeated single-character matches. We can give
up quickly if there are fewer than the minimum number of characters left in
the subject. */

REPEATCHAR:
#ifdef SUPPORT_UTF8
  if (utf8)
    {
    length = 1;
    charptr = ecode;
    GETCHARLEN(fc, ecode, length);
    if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
    ecode += length;

    /* Handle multibyte character matching specially here. There is
    support for caseless matching if UCP support is present. */

    if (length > 1)
      {
#ifdef SUPPORT_UCP
      unsigned int othercase;
      if ((ims & PCRE_CASELESS) != 0 &&
          (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
        oclength = _pcre_ord2utf8(othercase, occhars);
      else oclength = 0;
#endif  /* SUPPORT_UCP */

      /* The other-case encoding may be longer or shorter than the pattern
      character, so the up-front "min * length" test cannot guarantee that
      enough bytes remain for each comparison. Every memcmp() below is
      therefore guarded by an explicit check of the bytes left in the
      subject. */

      for (i = 1; i <= min; i++)
        {
        if (md->end_subject - eptr >= length &&
            memcmp(eptr, charptr, length) == 0) eptr += length;
#ifdef SUPPORT_UCP
        else if (oclength > 0 &&
                 md->end_subject - eptr >= oclength &&
                 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
#endif  /* SUPPORT_UCP */
        /* Braces are needed because RRETURN follows an else */
        else { RRETURN(MATCH_NOMATCH); }
        }

      if (min == max) continue;

      if (minimize)
        {
        for (fi = min;; fi++)
          {
          RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          if (fi >= max) RRETURN(MATCH_NOMATCH);

          /* The length guards also cover the end-of-subject case: with no
          bytes left neither comparison is attempted and the match fails. */

          if (md->end_subject - eptr >= length &&
              memcmp(eptr, charptr, length) == 0) eptr += length;
#ifdef SUPPORT_UCP
          else if (oclength > 0 &&
                   md->end_subject - eptr >= oclength &&
                   memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
#endif  /* SUPPORT_UCP */
          else { RRETURN(MATCH_NOMATCH); }
          }
        /* Control never gets here */
        }

      else  /* Maximize */
        {
        pp = eptr;
        for (i = min; i < max; i++)
          {
          /* Each form is tried only if it fits in what is left, so a
          shorter other-case encoding at the very end of the subject is
          still found. */

          if (md->end_subject - eptr >= length &&
              memcmp(eptr, charptr, length) == 0) eptr += length;
#ifdef SUPPORT_UCP
          else if (oclength > 0 &&
                   md->end_subject - eptr >= oclength &&
                   memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
#endif  /* SUPPORT_UCP */
          else break;
          }

        if (possessive) continue;

        for(;;)
          {
          RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          if (eptr == pp) RRETURN(MATCH_NOMATCH);
#ifdef SUPPORT_UCP
          /* The two case forms may differ in length, so step back by one
          whole character rather than by a fixed number of bytes. */
          eptr--;
          BACKCHAR(eptr);
#else   /* without SUPPORT_UCP */
          eptr -= length;
#endif  /* SUPPORT_UCP */
          }
        }
      /* Control never gets here */
      }

    /* If the length of a UTF-8 character is 1, we fall through here, and
    obey the code as for non-UTF-8 characters below, though in this case the
    value of fc will always be < 128. */
    }
  else
#endif  /* SUPPORT_UTF8 */

  /* When not in UTF-8 mode, load a single-byte character. */
    {
    if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
    fc = *ecode++;
    }

  /* The value of fc at this point is always less than 256, though we may or
  may not be in UTF-8 mode. The code is duplicated for the caseless and
  caseful cases, for speed, since matching characters is likely to be quite
  common. First, ensure the minimum number of matches are present. If min =
  max, continue at the same level without recursing. Otherwise, if
  minimizing, keep trying the rest of the expression and advancing one
  matching character if failing, up to the maximum. Alternatively, if
  maximizing, find the maximum number of characters and work backwards.

  The mandatory loops below read without an end-of-subject test: both paths
  into this code have already checked that at least min bytes remain. */

  DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
    max, eptr));

  if ((ims & PCRE_CASELESS) != 0)
    {
    fc = md->lcc[fc];
    for (i = 1; i <= min; i++)
      if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
    if (min == max) continue;
    if (minimize)
      {
      for (fi = min;; fi++)
        {
        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (fi >= max || eptr >= md->end_subject ||
            fc != md->lcc[*eptr++])
          RRETURN(MATCH_NOMATCH);
        }
      /* Control never gets here */
      }
    else  /* Maximize */
      {
      pp = eptr;
      for (i = min; i < max; i++)
        {
        if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
        eptr++;
        }
      if (possessive) continue;
      while (eptr >= pp)
        {
        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        eptr--;
        }
      RRETURN(MATCH_NOMATCH);
      }
    /* Control never gets here */
    }

  /* Caseful comparisons (includes all multi-byte characters) */

  else
    {
    for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
    if (min == max) continue;
    if (minimize)
      {
      for (fi = min;; fi++)
        {
        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
          RRETURN(MATCH_NOMATCH);
        }
      /* Control never gets here */
      }
    else  /* Maximize */
      {
      pp = eptr;
      for (i = min; i < max; i++)
        {
        if (eptr >= md->end_subject || fc != *eptr) break;
        eptr++;
        }
      if (possessive) continue;
      while (eptr >= pp)
        {
        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        eptr--;
        }
      RRETURN(MATCH_NOMATCH);
      }
    }
  /* Control never gets here */
/* Match any one character except the single byte that follows the opcode.
The subject character that is consumed may itself be multibyte. */
case OP_NOT:
  if (md->end_subject <= eptr) RRETURN(MATCH_NOMATCH);
  ecode++;
  GETCHARINCTEST(c, eptr);
  if ((ims & PCRE_CASELESS) == 0)
    {
    if (c == *ecode++) RRETURN(MATCH_NOMATCH);
    }
  else
    {
    /* Fold the subject character through the lower-casing table; with UTF-8
    support compiled in, only values below 256 are folded. */
#ifdef SUPPORT_UTF8
    if (c < 256) c = md->lcc[c];
#else
    c = md->lcc[c];
#endif
    if (c == md->lcc[*ecode++]) RRETURN(MATCH_NOMATCH);
    }
  break;
/* Match a negated single one-byte character repeatedly. This is almost a
repeat of the code for a repeated single character, but I haven't found a
nice way of commoning these up that doesn't require a test of the
positive/negative option for each character match. Maybe that wouldn't add
very much to the time taken, but character matching *is* what this is all
about... */
case OP_NOTEXACT:
min = max = GET2(ecode, 1);
ecode += 3;
goto REPEATNOTCHAR;
case OP_NOTUPTO:
case OP_NOTMINUPTO:
min = 0;
max = GET2(ecode, 1);
minimize = *ecode == OP_NOTMINUPTO;
ecode += 3;
goto REPEATNOTCHAR;
case OP_NOTPOSSTAR:
possessive = TRUE;
min = 0;
max = INT_MAX;
ecode++;
goto REPEATNOTCHAR;
case OP_NOTPOSPLUS:
possessive = TRUE;
min = 1;
max = INT_MAX;
ecode++;
goto REPEATNOTCHAR;
case OP_NOTPOSQUERY:
possessive = TRUE;
min = 0;
max = 1;
ecode++;
goto REPEATNOTCHAR;
case OP_NOTPOSUPTO:
possessive = TRUE;
min = 0;
max = GET2(ecode, 1);
ecode += 3;
goto REPEATNOTCHAR;
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPLUS:
case OP_NOTMINPLUS:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
c = *ecode++ - OP_NOTSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
/* Common code for all repeated single-byte matches. We can give up quickly
if there are fewer than the minimum number of bytes left in the
subject. */
REPEATNOTCHAR:
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
fc = *ecode++;
/* The code is duplicated for the caseless and caseful cases, for speed,
since matching characters is likely to be quite common. First, ensure the
minimum number of matches are present. If min = max, continue at the same
level without recursing. Otherwise, if minimizing, keep trying the rest of
the expression and advancing one matching character if failing, up to the
maximum. Alternatively, if maximizing, find the maximum number of
characters and work backwards. */
DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
max, eptr));
if ((ims & PCRE_CASELESS) != 0)
{
fc = md->lcc[fc];
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
register unsigned int d;
for (i = 1; i <= min; i++)
{
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
if (fc == d) RRETURN(MATCH_NOMATCH);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (i = 1; i <= min; i++)
if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
if (minimize)
{
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
register unsigned int d;
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
if (fi >= max || eptr >= md->end_subject || fc == d)
RRETURN(MATCH_NOMATCH);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
}
/* Maximize case */
else
{
pp = eptr;
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
register unsigned int d;
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(d, eptr, len);
if (d < 256) d = md->lcc[d];
if (fc == d) break;
eptr += len;
}
if (possessive) continue;
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
eptr++;
}
if (possessive) continue;
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
/* Caseful comparisons */
else
{
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
register unsigned int d;
for (i = 1; i <= min; i++)
{
GETCHARINC(d, eptr);
if (fc == d) RRETURN(MATCH_NOMATCH);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (i = 1; i <= min; i++)
if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
if (minimize)
{
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
register unsigned int d;
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
GETCHARINC(d, eptr);
if (fi >= max || eptr >= md->end_subject || fc == d)
RRETURN(MATCH_NOMATCH);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
}
/* Maximize case */
else
{
pp = eptr;
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
register unsigned int d;
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(d, eptr, len);
if (fc == d) break;
eptr += len;
}
if (possessive) continue;
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr);
}
}
else
#endif
/* Not UTF-8 mode */
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || fc == *eptr) break;
eptr++;
}
if (possessive) continue;
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
/* Match a single character type repeatedly; several different opcodes
share code. This is very similar to the code for single characters, but we
repeat it in the interests of efficiency. */
case OP_TYPEEXACT:
min = max = GET2(ecode, 1);
minimize = TRUE;
ecode += 3;
goto REPEATTYPE;
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
min = 0;
max = GET2(ecode, 1);
minimize = *ecode == OP_TYPEMINUPTO;
ecode += 3;
goto REPEATTYPE;
case OP_TYPEPOSSTAR:
possessive = TRUE;
min = 0;
max = INT_MAX;
ecode++;
goto REPEATTYPE;
case OP_TYPEPOSPLUS:
possessive = TRUE;
min = 1;
max = INT_MAX;
ecode++;
goto REPEATTYPE;
case OP_TYPEPOSQUERY:
possessive = TRUE;
min = 0;
max = 1;
ecode++;
goto REPEATTYPE;
case OP_TYPEPOSUPTO:
possessive = TRUE;
min = 0;
max = GET2(ecode, 1);
ecode += 3;
goto REPEATTYPE;
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
c = *ecode++ - OP_TYPESTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
/* Common code for all repeated single character type matches. Note that
in UTF-8 mode, '.' matches a character of any length, but for the other
character types, the valid characters are all one-byte long. */
REPEATTYPE:
ctype = *ecode++; /* Code for the character type */
#ifdef SUPPORT_UCP
if (ctype == OP_PROP || ctype == OP_NOTPROP)
{
prop_fail_result = ctype == OP_NOTPROP;
prop_type = *ecode++;
prop_value = *ecode++;
}
else prop_type = -1;
#endif
/* First, ensure the minimum number of matches are present. Use inline
code for maximizing the speed, and do the type test once at the start
(i.e. keep it out of the loop). Also we can test that there are at least
the minimum number of bytes before we start. This isn't as effective in
UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
is tidier. Also separate the UCP code, which can be the same for both UTF-8
and single-bytes. */
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
if (min > 0)
{
#ifdef SUPPORT_UCP
if (prop_type >= 0)
{
switch(prop_type)
{
case PT_ANY:
if (prop_fail_result) RRETURN(MATCH_NOMATCH);
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
}
break;
case PT_LAMP:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
break;
case PT_GC:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_category == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
break;
case PT_PC:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_chartype == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
break;
case PT_SC:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_script == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
}
/* Match extended Unicode sequences. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
{
for (i = 1; i <= min; i++)
{
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf8) c = *eptr; else
{
GETCHARLEN(c, eptr, len);
}
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category != ucp_M) break;
eptr += len;
}
}
}
else
#endif /* SUPPORT_UCP */
/* Handle all other cases when the coding is UTF-8 */
#ifdef SUPPORT_UTF8
if (utf8) switch(ctype)
{
case OP_ANY:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject || IS_NEWLINE(eptr))
RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
break;
case OP_ALLANY:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
break;
case OP_ANYBYTE:
eptr += min;
break;
case OP_ANYNL:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000a:
break;
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
}
break;
case OP_NOT_HSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(c)
{
default: break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
RRETURN(MATCH_NOMATCH);
}
}
break;
case OP_HSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
break;
}
}
break;
case OP_NOT_VSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(c)
{
default: break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
RRETURN(MATCH_NOMATCH);
}
}
break;
case OP_VSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
break;
}
}
break;
case OP_NOT_DIGIT:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
}
break;
case OP_DIGIT:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
case OP_NOT_WHITESPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
(*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
RRETURN(MATCH_NOMATCH);
while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
}
break;
case OP_WHITESPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
case OP_NOT_WORDCHAR:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
(*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
RRETURN(MATCH_NOMATCH);
while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
}
break;
case OP_WORDCHAR:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
} /* End switch(ctype) */
else
#endif /* SUPPORT_UTF8 */
/* Code for the non-UTF-8 case for minimum matching of operators other
than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
number of bytes present, as this was tested above. */
switch(ctype)
{
case OP_ANY:
for (i = 1; i <= min; i++)
{
if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
eptr++;
}
break;
case OP_ALLANY:
eptr += min;
break;
case OP_ANYBYTE:
eptr += min;
break;
/* Because of the CRLF case, we can't assume the minimum number of
bytes are present in this case. */
case OP_ANYNL:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
switch(*eptr++)
{
default: RRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000a:
break;
case 0x000b:
case 0x000c:
case 0x0085:
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
}
break;
case OP_NOT_HSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
switch(*eptr++)
{
default: break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
RRETURN(MATCH_NOMATCH);
}
}
break;
case OP_HSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
switch(*eptr++)
{
default: RRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
break;
}
}
break;
case OP_NOT_VSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
switch(*eptr++)
{
default: break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
RRETURN(MATCH_NOMATCH);
}
}
break;
case OP_VSPACE:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
switch(*eptr++)
{
default: RRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
break;
}
}
break;
case OP_NOT_DIGIT:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
}
/* If min = max, continue at the same level without recursing */
if (min == max) continue;
/* If minimizing, we have to test the rest of the pattern before each
subsequent match. Again, separate the UTF-8 case for speed, and also
separate the UCP cases. */
if (minimize)
{
#ifdef SUPPORT_UCP
if (prop_type >= 0)
{
switch(prop_type)
{
case PT_ANY:
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
if (prop_fail_result) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_LAMP:
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_GC:
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_category == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_PC:
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_chartype == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_SC:
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_script == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
}
/* Match extended Unicode sequences. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
if (!utf8) c = *eptr; else
{
GETCHARLEN(c, eptr, len);
}
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category != ucp_M) break;
eptr += len;
}
}
}
else
#endif /* SUPPORT_UCP */
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
(ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
case OP_ANY: /* This is the non-NL case */
case OP_ALLANY:
case OP_ANYBYTE:
break;
case OP_ANYNL:
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000a:
break;
case 0x000b:
case 0x000c:
case 0x0085:
case 0x2028:
case 0x2029:
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
break;
case OP_NOT_HSPACE:
switch(c)
{
default: break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
RRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
break;
}
break;
case OP_NOT_VSPACE:
switch(c)
{
default: break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
RRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
break;
}
break;
case OP_NOT_DIGIT:
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
}
}
else
#endif
/* Not UTF-8 mode */
{
for (fi = min;; fi++)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
(ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
case OP_ANY: /* This is the non-NL case */
case OP_ALLANY:
case OP_ANYBYTE:
break;
case OP_ANYNL:
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
case 0x000a:
break;
case 0x000b:
case 0x000c:
case 0x0085:
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
break;
case OP_NOT_HSPACE:
switch(c)
{
default: break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
RRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
break;
}
break;
case OP_NOT_VSPACE:
switch(c)
{
default: break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
RRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
default: RRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
break;
}
break;
case OP_NOT_DIGIT:
if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
}
}
/* Control never gets here */
}
/* If maximizing, it is worth using inline code for speed, doing the type
test once at the start (i.e. keep it out of the loop). Again, keep the
UTF-8 and UCP stuff separate. */
else
{
pp = eptr; /* Remember where we started */
#ifdef SUPPORT_UCP
if (prop_type >= 0)
{
switch(prop_type)
{
case PT_ANY:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (prop_fail_result) break;
eptr+= len;
}
break;
case PT_LAMP:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
break;
eptr+= len;
}
break;
case PT_GC:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_category == prop_value) == prop_fail_result)
break;
eptr+= len;
}
break;
case PT_PC:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_chartype == prop_value) == prop_fail_result)
break;
eptr+= len;
}
break;
case PT_SC:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if ((prop_script == prop_value) == prop_fail_result)
break;
eptr+= len;
}
break;
}
/* eptr is now past the end of the maximum run */
if (possessive) continue;
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
if (utf8) BACKCHAR(eptr);
}
}
/* Match extended Unicode sequences. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
GETCHARINCTEST(c, eptr);
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category == ucp_M) break;
while (eptr < md->end_subject)
{
int len = 1;
if (!utf8) c = *eptr; else
{
GETCHARLEN(c, eptr, len);
}
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category != ucp_M) break;
eptr += len;
}
}
/* eptr is now past the end of the maximum run */
if (possessive) continue;
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
for (;;) /* Move back over one extended */
{
int len = 1;
if (!utf8) c = *eptr; else
{
BACKCHAR(eptr);
GETCHARLEN(c, eptr, len);
}
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
if (prop_category != ucp_M) break;
eptr--;
}
}
}
else
#endif /* SUPPORT_UCP */
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (utf8)
{
switch(ctype)
{
case OP_ANY:
if (max < INT_MAX)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
/* Handle unlimited UTF-8 repeat */
else
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
break;
case OP_ALLANY:
if (max < INT_MAX)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
break;
/* The byte case is the same as non-UTF8 */
case OP_ANYBYTE:
c = max - min;
if (c > (unsigned int)(md->end_subject - eptr))
c = md->end_subject - eptr;
eptr += c;
break;
case OP_ANYNL:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c == 0x000d)
{
if (++eptr >= md->end_subject) break;
if (*eptr == 0x000a) eptr++;
}
else
{
if (c != 0x000a &&
(md->bsr_anycrlf ||
(c != 0x000b && c != 0x000c &&
c != 0x0085 && c != 0x2028 && c != 0x2029)))
break;
eptr += len;
}
}
break;
case OP_NOT_HSPACE:
case OP_HSPACE:
for (i = min; i < max; i++)
{
BOOL gotspace;
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
switch(c)
{
default: gotspace = FALSE; break;
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
case 0x1680: /* OGHAM SPACE MARK */
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
case 0x2000: /* EN QUAD */
case 0x2001: /* EM QUAD */
case 0x2002: /* EN SPACE */
case 0x2003: /* EM SPACE */
case 0x2004: /* THREE-PER-EM SPACE */
case 0x2005: /* FOUR-PER-EM SPACE */
case 0x2006: /* SIX-PER-EM SPACE */
case 0x2007: /* FIGURE SPACE */
case 0x2008: /* PUNCTUATION SPACE */
case 0x2009: /* THIN SPACE */
case 0x200A: /* HAIR SPACE */
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
gotspace = TRUE;
break;
}
if (gotspace == (ctype == OP_NOT_HSPACE)) break;
eptr += len;
}
break;
case OP_NOT_VSPACE:
case OP_VSPACE:
for (i = min; i < max; i++)
{
BOOL gotspace;
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
switch(c)
{
default: gotspace = FALSE; break;
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
gotspace = TRUE;
break;
}
if (gotspace == (ctype == OP_NOT_VSPACE)) break;
eptr += len;
}
break;
case OP_NOT_DIGIT:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
eptr+= len;
}
break;
case OP_DIGIT:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
eptr+= len;
}
break;
case OP_NOT_WHITESPACE:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
eptr+= len;
}
break;
case OP_WHITESPACE:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
eptr+= len;
}
break;
case OP_NOT_WORDCHAR:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
eptr+= len;
}
break;
case OP_WORDCHAR:
for (i = min; i < max; i++)
{
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
eptr+= len;
}
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
if (possessive) continue;
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr);
}
}
else
#endif /* SUPPORT_UTF8 */
/* Not UTF-8 mode */
{
switch(ctype)
{
case OP_ANY:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
}
break;
case OP_ALLANY:
case OP_ANYBYTE:
c = max - min;
if (c > (unsigned int)(md->end_subject - eptr))
c = md->end_subject - eptr;
eptr += c;
break;
case OP_ANYNL:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if (c == 0x000d)
{
if (++eptr >= md->end_subject) break;
if (*eptr == 0x000a) eptr++;
}
else
{
if (c != 0x000a &&
(md->bsr_anycrlf ||
(c != 0x000b && c != 0x000c && c != 0x0085)))
break;
eptr++;
}
}
break;
case OP_NOT_HSPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if (c == 0x09 || c == 0x20 || c == 0xa0) break;
eptr++;
}
break;
case OP_HSPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if (c != 0x09 && c != 0x20 && c != 0xa0) break;
eptr++;
}
break;
case OP_NOT_VSPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
break;
eptr++;
}
break;
case OP_VSPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
break;
eptr++;
}
break;
case OP_NOT_DIGIT:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
break;
eptr++;
}
break;
case OP_DIGIT:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
break;
eptr++;
}
break;
case OP_NOT_WHITESPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
break;
eptr++;
}
break;
case OP_WHITESPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
break;
eptr++;
}
break;
case OP_NOT_WORDCHAR:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
break;
eptr++;
}
break;
case OP_WORDCHAR:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
break;
eptr++;
}
break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
if (possessive) continue;
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
}
/* Get here if we can't make it match with any permitted repetitions */
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
/* There's been some horrible disaster. Arrival here can only mean there is
something seriously wrong in the code above or the OP_xxx definitions. */
default:
DPRINTF(("Unknown opcode %d\n", *ecode));
RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
}
/* Do not stick any code in here without much thought; it is assumed
that "continue" in the code above comes out to here to repeat the main
loop. */
} /* End of main loop */
/* Control never reaches here */
/* When compiling to use the heap rather than the stack for recursive calls to
match(), the RRETURN() macro jumps here. The number that is saved in
frame->Xwhere indicates which label we actually want to return to. */
#ifdef NO_RECURSE
#define LBL(val) case val: goto L_RM##val;
HEAP_RETURN:
switch (frame->Xwhere)
{
LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
LBL(53) LBL(54)
#ifdef SUPPORT_UTF8
LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
#endif /* SUPPORT_UCP */
#endif /* SUPPORT_UTF8 */
default:
DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
return PCRE_ERROR_INTERNAL;
}
#undef LBL
#endif /* NO_RECURSE */
}
/***************************************************************************
****************************************************************************
RECURSION IN THE match() FUNCTION
Undefine all the macros that were defined above to handle this.

In a NO_RECURSE build (heap rather than stack used for recursive calls to
match()) the short names listed here are macros, not real variables.
NOTE(review): presumably each one expands to a field of the structure that
match() accesses through "frame" - the #define lines are not in this part of
the file, so confirm against them.

The macros must not remain in effect after match(): pcre_exec(), which
follows, uses several of the same identifiers as ordinary parameters and
local variables (for example length, ims, flags and pp). */
#ifdef NO_RECURSE
#undef eptr
#undef ecode
#undef mstart
#undef offset_top
#undef ims
#undef eptrb
#undef flags
#undef callpat
#undef charptr
#undef data
#undef next
#undef pp
#undef prev
#undef saved_eptr
#undef new_recursive
#undef cur_is_word
#undef condition
#undef prev_is_word
#undef original_ims
#undef ctype
#undef length
#undef max
#undef min
#undef number
#undef offset
#undef op
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef stacksave
#undef newptrb
#endif
/* These two are defined as macros in both cases (that is, whether or not
NO_RECURSE is set), so they are undefined outside the conditional block. */
#undef fc
#undef fi
/***************************************************************************
***************************************************************************/
/*************************************************
* Execute a Regular Expression *
*************************************************/
| pcreexec.c | 401 |
PCRE_EXP_DEFN INT | pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount)
PCRE_EXP_DEFN int
pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
int offsetcount)
{
int rc, resetcount, ocount;
int first_byte = -1;
int req_byte = -1;
int req_byte2 = -1;
int newline;
unsigned long int ims;
BOOL using_temporary_offsets = FALSE;
BOOL anchored;
BOOL startline;
BOOL firstline;
BOOL first_byte_caseless = FALSE;
BOOL req_byte_caseless = FALSE;
BOOL utf8;
match_data match_block;
match_data *md = &match_block;
const uschar *tables;
const uschar *start_bits = NULL;
USPTR start_match = (USPTR)subject + start_offset;
USPTR end_subject;
USPTR req_byte_ptr = start_match - 1;
pcre_study_data internal_study;
const pcre_study_data *study;
real_pcre internal_re;
const real_pcre *external_re = (const real_pcre *)argument_re;
const real_pcre *re = external_re;
/* Plausibility checks */
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
/* Fish out the optional data from the extra_data structure, first setting
the default values. */
study = NULL;
md->match_limit = MATCH_LIMIT;
md->match_limit_recursion = MATCH_LIMIT_RECURSION;
md->callout_data = NULL;
/* The table pointer is always in native byte order. */
tables = external_re->tables;
if (extra_data != NULL)
{
register unsigned int flags = extra_data->flags;
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
md->match_limit = extra_data->match_limit;
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
md->match_limit_recursion = extra_data->match_limit_recursion;
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
md->callout_data = extra_data->callout_data;
if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
}
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
is a feature that makes it possible to save compiled regex and re-use them
in other programs later. */
if (tables == NULL) tables = _pcre_default_tables;
/* Check that the first field in the block is the magic number. If it is not,
test for a regex that was compiled on a host of opposite endianness. If this is
the case, flipped values are put in internal_re and internal_study if there was
study data too. */
if (re->magic_number != MAGIC_NUMBER)
{
re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
if (re == NULL) return PCRE_ERROR_BADMAGIC;
if (study != NULL) study = &internal_study;
}
/* Set up other data */
anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
startline = (re->flags & PCRE_STARTLINE) != 0;
firstline = (re->options & PCRE_FIRSTLINE) != 0;
/* The code starts after the real_pcre block and the capture name table. */
md->start_code = (const uschar *)external_re + re->name_table_offset +
re->name_count * re->name_entry_size;
md->start_subject = (USPTR)subject;
md->start_offset = start_offset;
md->end_subject = md->start_subject + length;
end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0;
md->noteol = (options & PCRE_NOTEOL) != 0;
md->notempty = (options & PCRE_NOTEMPTY) != 0;
md->partial = (options & PCRE_PARTIAL) != 0;
md->hitend = FALSE;
md->recursive = NULL; /* No recursion at top level */
md->lcc = tables + lcc_offset;
md->ctypes = tables + ctypes_offset;
/* Handle different \R options. */
switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
{
case 0:
if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
else
#ifdef BSR_ANYCRLF
md->bsr_anycrlf = TRUE;
#else
md->bsr_anycrlf = FALSE;
#endif
break;
case PCRE_BSR_ANYCRLF:
md->bsr_anycrlf = TRUE;
break;
case PCRE_BSR_UNICODE:
md->bsr_anycrlf = FALSE;
break;
default: return PCRE_ERROR_BADNEWLINE;
}
/* Handle different types of newline. The three bits give eight cases. If
nothing is set at run time, whatever was used at compile time applies. */
switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
(pcre_uint32)options) & PCRE_NEWLINE_BITS)
{
case 0: newline = NEWLINE; break; /* Compile-time default */
case PCRE_NEWLINE_CR: newline = '\r'; break;
case PCRE_NEWLINE_LF: newline = '\n'; break;
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
case PCRE_NEWLINE_ANY: newline = -1; break;
case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
default: return PCRE_ERROR_BADNEWLINE;
}
if (newline == -2)
{
md->nltype = NLTYPE_ANYCRLF;
}
else if (newline < 0)
{
md->nltype = NLTYPE_ANY;
}
else
{
md->nltype = NLTYPE_FIXED;
if (newline > 255)
{
md->nllen = 2;
md->nl[0] = (newline >> 8) & 255;
md->nl[1] = newline & 255;
}
else
{
md->nllen = 1;
md->nl[0] = newline;
}
}
/* Partial matching is supported only for a restricted set of regexes at the
moment. */
if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
return PCRE_ERROR_BADPARTIAL;
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
return PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
int tb = ((uschar *)subject)[start_offset];
if (tb > 127)
{
tb &= 0xc0;
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
}
#endif
/* The ims options can vary during the matching as a result of the presence
of (?ims) items in the pattern. They are kept in a local variable so that
restoring at the exit of a group is easy. */
ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
/* If the expression has got more back references than the offsets supplied can
hold, we get a temporary chunk of working store to use during the matching.
Otherwise, we can use the vector supplied, rounding down its size to a multiple
of 3. */
ocount = offsetcount - (offsetcount % 3);
if (re->top_backref > 0 && re->top_backref >= ocount/3)
{
ocount = re->top_backref * 3 + 3;
md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
using_temporary_offsets = TRUE;
DPRINTF(("Got memory to hold back references\n"));
}
else md->offset_vector = offsets;
md->offset_end = ocount;
md->offset_max = (2*ocount)/3;
md->offset_overflow = FALSE;
md->capture_last = -1;
/* Compute the minimum number of offsets that we need to reset each time. Doing
this makes a huge difference to execution time when there aren't many brackets
in the pattern. */
resetcount = 2 + re->top_bracket * 2;
if (resetcount > offsetcount) resetcount = ocount;
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
initialize them to avoid reading uninitialized locations. */
if (md->offset_vector != NULL)
{
register int *iptr = md->offset_vector + ocount;
register int *iend = iptr - resetcount/2 + 1;
while (--iptr >= iend) *iptr = -1;
}
/* Set up the first character to match, if available. The first_byte value is
never set for an anchored regular expression, but the anchoring may be forced
at run time, so we have to test for anchoring. The first char may be unset for
an unanchored pattern, of course. If there's no first char and the pattern was
studied, there may be a bitmap of possible first characters. */
if (!anchored)
{
if ((re->flags & PCRE_FIRSTSET) != 0)
{
first_byte = re->first_byte & 255;
if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
first_byte = md->lcc[first_byte];
}
else
if (!startline && study != NULL &&
(study->options & PCRE_STUDY_MAPPED) != 0)
start_bits = study->start_bits;
}
/* For anchored or unanchored matches, there may be a "last known required
character" set. */
if ((re->flags & PCRE_REQCHSET) != 0)
{
req_byte = re->req_byte & 255;
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
}
/* ==========================================================================*/
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
the loop runs just once. */
for(;;)
{
USPTR save_end_subject = end_subject;
USPTR new_start_match;
/* Reset the maximum number of extractions we might see. */
if (md->offset_vector != NULL)
{
register int *iptr = md->offset_vector;
register int *iend = iptr + resetcount;
while (iptr < iend) *iptr++ = -1;
}
/* Advance to a unique first char if possible. If firstline is TRUE, the
start of the match is constrained to the first line of a multiline string.
That is, the match must be before or at the first newline. Implement this by
temporarily adjusting end_subject so that we stop scanning at a newline. If
the match fails at the newline, later code breaks this loop. */
if (firstline)
{
USPTR t = start_match;
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
/* Now test for a unique first byte */
if (first_byte >= 0)
{
if (first_byte_caseless)
while (start_match < end_subject &&
md->lcc[*start_match] != first_byte)
{ NEXTCHAR(start_match); }
else
while (start_match < end_subject && *start_match != first_byte)
{ NEXTCHAR(start_match); }
}
/* Or to just after a linebreak for a multiline match if possible */
else if (startline)
{
if (start_match > md->start_subject + start_offset)
{
while (start_match <= end_subject && !WAS_NEWLINE(start_match))
{ NEXTCHAR(start_match); }
/* If we have just passed a CR and the newline option is ANY or ANYCRLF,
and we are now at a LF, advance the match position by one more character.
*/
if (start_match[-1] == '\r' &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
*start_match == '\n')
start_match++;
}
}
/* Or to a non-unique first char after study */
else if (start_bits != NULL)
{
while (start_match < end_subject)
{
register unsigned int c = *start_match;
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{ NEXTCHAR(start_match); }
else break;
}
}
/* Restore fudged end_subject */
end_subject = save_end_subject;
#ifdef DEBUG /* Sigh. Some compilers never learn. */
printf(">>>> Match against: ");
pchars(start_match, end_subject - start_match, TRUE, md);
printf("\n");
#endif
/* If req_byte is set, we know that that character must appear in the subject
for the match to succeed. If the first character is set, req_byte must be
later in the subject; otherwise the test starts at the match point. This
optimization can save a huge amount of backtracking in patterns with nested
unlimited repeats that aren't going to match. Writing separate code for
cased/caseless versions makes it go faster, as does using an autoincrement
and backing off on a match.
HOWEVER: when the subject string is very, very long, searching to its end can
take a long time, and give bad performance on quite ordinary patterns. This
showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
string... so we don't do this when the string is sufficiently long.
ALSO: this processing is disabled when partial matching is requested.
*/
if (req_byte >= 0 &&
end_subject - start_match < REQ_BYTE_MAX &&
!md->partial)
{
register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
if (p > req_byte_ptr)
{
if (req_byte_caseless)
{
while (p < end_subject)
{
register int pp = *p++;
if (pp == req_byte || pp == req_byte2) { p--; break; }
}
}
else
{
while (p < end_subject)
{
if (*p++ == req_byte) { p--; break; }
}
}
/* If we can't find the required character, break the matching loop,
forcing a match failure. */
if (p >= end_subject)
{
rc = MATCH_NOMATCH;
break;
}
/* If we have found the required character, save the point where we
found it, so that we don't search again next time round the loop if
the start hasn't passed this character yet. */
req_byte_ptr = p;
}
}
/* OK, we can now run the match. */
md->start_match_ptr = start_match;
md->match_call_count = 0;
rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
switch(rc)
{
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
exactly like PRUNE. */
case MATCH_NOMATCH:
case MATCH_PRUNE:
case MATCH_THEN:
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF8
if (utf8)
while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
new_start_match++;
#endif
break;
/* SKIP passes back the next starting point explicitly. */
case MATCH_SKIP:
new_start_match = md->start_match_ptr;
break;
/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
case MATCH_COMMIT:
rc = MATCH_NOMATCH;
goto ENDLOOP;
/* Any other return is some kind of error. */
default:
goto ENDLOOP;
}
/* Control reaches here for the various types of "no match at this point"
result. Reset the code to MATCH_NOMATCH for subsequent checking. */
rc = MATCH_NOMATCH;
/* If PCRE_FIRSTLINE is set, the match must happen before or at the first
newline in the subject (though it may continue over the newline). Therefore,
if we have just failed to match, starting at a newline, do not continue. */
if (firstline && IS_NEWLINE(start_match)) break;
/* Advance to new matching position */
start_match = new_start_match;
/* Break the loop if the pattern is anchored or if we have passed the end of
the subject. */
if (anchored || start_match > end_subject) break;
/* If we have just passed a CR and we are now at a LF, and the pattern does
not contain any explicit matches for \r or \n, and the newline option is CRLF
or ANY or ANYCRLF, advance the match position by one more character. */
if (start_match[-1] == '\r' &&
start_match < end_subject &&
*start_match == '\n' &&
(re->flags & PCRE_HASCRORLF) == 0 &&
(md->nltype == NLTYPE_ANY ||
md->nltype == NLTYPE_ANYCRLF ||
md->nllen == 2))
start_match++;
} /* End of for(;;) "bumpalong" loop */
/* ==========================================================================*/
/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
conditions is true:
(1) The pattern is anchored or the match was failed by (*COMMIT);
(2) We are past the end of the subject;
(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
this option requests that a match occur at or before the first newline in
the subject.
When we have a match and the offset vector is big enough to deal with any
backreferences, captured substring offsets will already be set up. In the case
where we had to get some local store to hold offsets for backreference
processing, copy those that we can. In this case there need not be overflow if
certain parts of the pattern were not used, even though there are more
capturing parentheses than vector slots. */
ENDLOOP:
if (rc == MATCH_MATCH)
{
if (using_temporary_offsets)
{
if (offsetcount >= 4)
{
memcpy(offsets + 2, md->offset_vector + 2,
(offsetcount - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(md->offset_vector);
}
/* Set the return code to the number of captured strings, or 0 if there are
too many to fit into the vector. */
rc = md->offset_overflow? 0 : md->end_offset_top/2;
/* If there is space, set up the whole thing as substring 0. The value of
md->start_match_ptr might be modified if \K was encountered on the success
matching path. */
if (offsetcount < 2) rc = 0; else
{
offsets[0] = md->start_match_ptr - md->start_subject;
offsets[1] = md->end_match_ptr - md->start_subject;
}
DPRINTF((">>>> returning %d\n", rc));
return rc;
}
/* Control gets here if there has been an error, or if the overall match
attempt has failed at all permitted starting positions. */
if (using_temporary_offsets)
{
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(md->offset_vector);
}
if (rc != MATCH_NOMATCH)
{
DPRINTF((">>>> error: returning %d\n", rc));
return rc;
}
else if (md->partial && md->hitend)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
return PCRE_ERROR_PARTIAL;
}
else
{
DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
return PCRE_ERROR_NOMATCH;
}
}
| pcreexec.c | 4363 |
pcrefind.c |
Type | Function | Source | Line |
INT | _pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
/* Look up the Unicode property record covering character c in ucp_table.

Arguments:
  c           the character value to look up
  type_ptr    receives the detailed character type
  script_ptr  receives the script

Returns:      the general type: ucp_gentype[] of the detailed type when a
              table entry covers c, otherwise ucp_C (with *type_ptr set to
              ucp_Cn and *script_ptr set to ucp_Common)
*/
int
_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
{
int lo = 0;
int hi = sizeof(ucp_table)/sizeof(cnode);
int probe;

/* Binary chop over the half-open interval [lo, hi). The table expressions
are deliberately repeated rather than cached in locals. */

while (lo < hi)
  {
  probe = (lo + hi) >> 1;

  /* Character is below this entry: continue in the lower half. */

  if (c < (ucp_table[probe].f0 & f0_charmask))
    {
    hi = probe;
    continue;
    }

  /* Either an exact hit on the entry's character, or the entry is a range
  and c lies within it. */

  if (c == (ucp_table[probe].f0 & f0_charmask) ||
      ((ucp_table[probe].f0 & f0_rangeflag) != 0 &&
       c <= (ucp_table[probe].f0 & f0_charmask) +
            (ucp_table[probe].f1 & f1_rangemask)))
    {
    *script_ptr = (ucp_table[probe].f0 & f0_scriptmask) >> f0_scriptshift;
    *type_ptr = (ucp_table[probe].f1 & f1_typemask) >> f1_typeshift;
    return ucp_gentype[*type_ptr];
    }

  /* Character is above this entry: continue in the upper half. */

  lo = probe + 1;
  }

/* No entry covers c. */

*type_ptr = ucp_Cn;
*script_ptr = ucp_Common;
return ucp_C;
}
/*************************************************
* Search table and return other case *
*************************************************/
| pcrefind.c | 85 |
UNSIGNED INT | _pcre_ucp_othercase(const unsigned int c)
/* Find the other-case partner of character c using ucp_table.

Argument:   c   the character value to look up

Returns:    (unsigned int)-1 if no table entry covers c; NOTACHAR if the
            covering entry is a range entry or its case offset is zero;
            otherwise c plus the (sign-extended) case offset
*/
unsigned int
_pcre_ucp_othercase(const unsigned int c)
{
int lo = 0;
int hi = sizeof(ucp_table)/sizeof(cnode);

/* Binary chop over the half-open interval [lo, hi). The table expressions
are deliberately repeated rather than cached in locals. */

while (lo < hi)
  {
  int probe = (lo + hi) >> 1;
  int delta;

  if (c < (ucp_table[probe].f0 & f0_charmask))
    {
    hi = probe;
    continue;
    }

  /* Not an exact hit and not inside a range entry: go to the upper half. */

  if (c != (ucp_table[probe].f0 & f0_charmask) &&
      !((ucp_table[probe].f0 & f0_rangeflag) != 0 &&
        c <= (ucp_table[probe].f0 & f0_charmask) +
             (ucp_table[probe].f1 & f1_rangemask)))
    {
    lo = probe + 1;
    continue;
    }

  /* Found the covering entry. Range entries carry no case information. */

  if ((ucp_table[probe].f0 & f0_rangeflag) != 0) return NOTACHAR;

  /* Extract the case offset; if its sign bit is set, extend the sign. */

  delta = ucp_table[probe].f1 & f1_casemask;
  if ((delta & f1_caseneg) != 0) delta |= f1_caseneg;
  if (delta == 0) return NOTACHAR;
  return c + delta;
  }

return (unsigned int) -1;
}
| pcrefind.c | 141 |
pcrefinf.c |
Type | Function | Source | Line |
PCRE_EXP_DEFN INT | pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, void *where)
PCRE_EXP_DEFN int
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
void *where)
{
real_pcre internal_re;
pcre_study_data internal_study;
const real_pcre *re = (const real_pcre *)argument_re;
const pcre_study_data *study = NULL;
if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if (re->magic_number != MAGIC_NUMBER)
{
re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
if (re == NULL) return PCRE_ERROR_BADMAGIC;
if (study != NULL) study = &internal_study;
}
switch (what)
{
case PCRE_INFO_OPTIONS:
*((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
break;
case PCRE_INFO_SIZE:
*((size_t *)where) = re->size;
break;
case PCRE_INFO_STUDYSIZE:
*((size_t *)where) = (study == NULL)? 0 : study->size;
break;
case PCRE_INFO_CAPTURECOUNT:
*((int *)where) = re->top_bracket;
break;
case PCRE_INFO_BACKREFMAX:
*((int *)where) = re->top_backref;
break;
case PCRE_INFO_FIRSTBYTE:
*((int *)where) =
((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
break;
/* Make sure we pass back the pointer to the bit vector in the external
block, not the internal copy (with flipped integer fields). */
case PCRE_INFO_FIRSTTABLE:
*((const uschar **)where) =
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
break;
case PCRE_INFO_LASTLITERAL:
*((int *)where) =
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
break;
case PCRE_INFO_NAMEENTRYSIZE:
*((int *)where) = re->name_entry_size;
break;
case PCRE_INFO_NAMECOUNT:
*((int *)where) = re->name_count;
break;
case PCRE_INFO_NAMETABLE:
*((const uschar **)where) = (const uschar *)re + re->name_table_offset;
break;
case PCRE_INFO_DEFAULT_TABLES:
*((const uschar **)where) = (const uschar *)(_pcre_default_tables);
break;
case PCRE_INFO_OKPARTIAL:
*((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0;
break;
case PCRE_INFO_JCHANGED:
*((int *)where) = (re->flags & PCRE_JCHANGED) != 0;
break;
case PCRE_INFO_HASCRORLF:
*((int *)where) = (re->flags & PCRE_HASCRORLF) != 0;
break;
default: return PCRE_ERROR_BADOPTION;
}
return 0;
}
| pcrefinf.c | 68 |
pcreget.c |
Type | Function | Source | Line |
INT | pcre_get_stringnumber(const pcre *code, const char *stringname)
/* Find the number of a named group by binary search of the name table.

Arguments:
  code         the compiled pattern
  stringname   the name to look for

Returns:       the number stored in the first two bytes of the matching
               entry; PCRE_ERROR_NOSUBSTRING if the table is empty or the
               name is absent; or a non-zero code from pcre_fullinfo()
*/
int
pcre_get_stringnumber(const pcre *code, const char *stringname)
{
int status;
int entry_len;
int lo, hi;
uschar *table;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &hi);
if (status != 0) return status;
if (hi <= 0) return PCRE_ERROR_NOSUBSTRING;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entry_len);
if (status != 0) return status;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &table);
if (status != 0) return status;

/* Each entry is entry_len bytes: a two-byte number (high byte first)
followed by the name as a C string. */

for (lo = 0; lo < hi; )
  {
  int mid = (hi + lo) / 2;
  uschar *entry = table + entry_len*mid;
  int cmp = strcmp(stringname, (char *)(entry + 2));

  if (cmp > 0)
    lo = mid + 1;
  else if (cmp < 0)
    hi = mid;
  else
    return (entry[0] << 8) + entry[1];
  }

return PCRE_ERROR_NOSUBSTRING;
}
/*************************************************
* Find (multiple) entries for named string *
*************************************************/
| pcreget.c | 68 |
INT | pcre_get_stringtable_entries(const pcre *code, const char *stringname, char **firstptr, char **lastptr)
/* Find the first and last name-table entries that carry a given name.

Arguments:
  code         the compiled pattern
  stringname   the name to look for
  firstptr     receives a pointer to the first matching entry
  lastptr      receives a pointer to the last matching entry

Returns:       the size of one table entry when the name is found (with
               *firstptr and *lastptr set); PCRE_ERROR_NOSUBSTRING if the
               table is empty or the name is absent; or a non-zero code
               from pcre_fullinfo()
*/
int
pcre_get_stringtable_entries(const pcre *code, const char *stringname,
  char **firstptr, char **lastptr)
{
int status;
int entry_len;
int lo, hi;
uschar *table, *table_last;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &hi);
if (status != 0) return status;
if (hi <= 0) return PCRE_ERROR_NOSUBSTRING;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entry_len);
if (status != 0) return status;

status = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &table);
if (status != 0) return status;

table_last = table + entry_len * (hi - 1);

for (lo = 0; lo < hi; )
  {
  int mid = (hi + lo) / 2;
  uschar *hit = table + entry_len*mid;
  int cmp = strcmp(stringname, (char *)(hit + 2));

  if (cmp > 0) { lo = mid + 1; continue; }
  if (cmp < 0) { hi = mid; continue; }

  /* Found one entry with this name. Widen in both directions over the
  neighbouring entries that carry the same name. */

    {
    uschar *run_start = hit;
    uschar *run_end = hit;

    while (run_start > table &&
           strcmp(stringname, (char *)(run_start - entry_len + 2)) == 0)
      run_start -= entry_len;

    while (run_end < table_last &&
           strcmp(stringname, (char *)(run_end + entry_len + 2)) == 0)
      run_end += entry_len;

    *firstptr = (char *)run_start;
    *lastptr = (char *)run_end;
    return entry_len;
    }
  }

return PCRE_ERROR_NOSUBSTRING;
}
/*************************************************
* Find first set of multiple named strings *
*************************************************/
| pcreget.c | 117 |
STATIC INT | get_first_set(const pcre *code, const char *stringname, int *ovector)
/* Find the number of the first set group among those sharing a name.

When neither PCRE_DUPNAMES (in the options) nor PCRE_JCHANGED (in the flags)
is set, this simply defers to pcre_get_stringnumber(). Otherwise it walks
all table entries for the name and returns the first whose ovector start
offset is non-negative. If none is set, the number of the first entry is
returned.

Arguments:
  code         the compiled pattern
  stringname   the name to look for
  ovector      the offset vector from a match; indexed by group number * 2

Returns:       a group number, or the zero/negative value passed back by
               pcre_get_stringtable_entries()
*/
static int
get_first_set(const pcre *code, const char *stringname, int *ovector)
{
const real_pcre *re = (const real_pcre *)code;
int entrysize;
char *first, *last;
uschar *entry;

if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
  return pcre_get_stringnumber(code, stringname);

entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
if (entrysize <= 0) return entrysize;

for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize)
  {
  int n = (entry[0] << 8) + entry[1];
  if (ovector[n*2] >= 0) return n;
  }

/* None of the groups is set: fall back to the first entry's number. Read the
two number bytes through a uschar pointer, exactly as the loop above does;
reading them through the plain "char *" would give a wrong (possibly
negative) result for byte values >= 0x80 where plain char is signed. */

entry = (uschar *)first;
return (entry[0] << 8) + entry[1];
}
/*************************************************
* Copy captured string to given buffer *
*************************************************/
| pcreget.c | 185 |
INT | pcre_copy_substring(const char *subject, int *ovector, int stringcount, int stringnumber, char *buffer, int size)
/* Copy one captured substring into a caller-supplied buffer, adding a
terminating zero.

Arguments:
  subject        the subject string that was matched
  ovector        the vector of offset pairs from the match
  stringcount    the number of captured strings available
  stringnumber   which captured string is wanted
  buffer         where to put the copy
  size           the size of buffer

Returns:         the length of the copied string (excluding the zero);
                 PCRE_ERROR_NOSUBSTRING if stringnumber is out of range;
                 PCRE_ERROR_NOMEMORY if the buffer is too small
*/
int
pcre_copy_substring(const char *subject, int *ovector, int stringcount,
  int stringnumber, char *buffer, int size)
{
int pair;
int length;

if (stringnumber < 0 || stringnumber >= stringcount)
  return PCRE_ERROR_NOSUBSTRING;

/* Each capture occupies two consecutive slots: start offset, end offset. */

pair = stringnumber * 2;
length = ovector[pair+1] - ovector[pair];

/* The buffer has to hold the text plus the terminating zero. */

if (size < length + 1) return PCRE_ERROR_NOMEMORY;

memcpy(buffer, subject + ovector[pair], length);
buffer[length] = 0;
return length;
}
/*************************************************
* Copy named captured string to given buffer *
*************************************************/
| pcreget.c | 234 |
INT | pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int size)
/* Copy a named captured substring into a caller-supplied buffer. The name is
resolved with get_first_set(); a zero or negative result from that lookup is
returned unchanged, otherwise the work is done by pcre_copy_substring().

Arguments:
  code           the compiled pattern
  subject        the subject string that was matched
  ovector        the vector of offset pairs from the match
  stringcount    the number of captured strings available
  stringname     the name of the wanted group
  buffer         where to put the copy
  size           the size of buffer

Returns:         the result of pcre_copy_substring(), or the zero/negative
                 value from the name lookup
*/
int
pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
  int stringcount, const char *stringname, char *buffer, int size)
{
int group = get_first_set(code, stringname, ovector);

if (group > 0)
  return pcre_copy_substring(subject, ovector, stringcount, group, buffer,
    size);

return group;
}
/*************************************************
* Copy all captured strings to new store *
*************************************************/
| pcreget.c | 279 |
INT | pcre_get_substring_list(const char *subject, int *ovector, int stringcount, const char ***listptr)
/* Copy all captured substrings into one block obtained from pcre_malloc.
The block starts with a NULL-terminated vector of string pointers, which is
followed by the zero-terminated strings themselves.

Arguments:
  subject        the subject string that was matched
  ovector        the vector of offset pairs from the match
  stringcount    the number of captured strings
  listptr        receives a pointer to the start of the block

Returns:         0 on success; PCRE_ERROR_NOMEMORY if the block cannot be
                 obtained
*/
int
pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
  const char ***listptr)
{
int k;
int filled = 0;
int total = sizeof(char *);               /* room for the final NULL */
int slot_count = stringcount * 2;
char **vector;
char *text;

/* First pass: one pointer plus the text and its zero byte per string. */

for (k = 0; k < slot_count; k += 2)
  total += sizeof(char *) + ovector[k+1] - ovector[k] + 1;

vector = (char **)(pcre_malloc)(total);
if (vector == NULL) return PCRE_ERROR_NOMEMORY;

*listptr = (const char **)vector;

/* The string data starts straight after the pointer vector. */

text = (char *)(vector + stringcount + 1);

/* Second pass: copy each string and record where it was put. */

for (k = 0; k < slot_count; k += 2)
  {
  int length = ovector[k+1] - ovector[k];
  memcpy(text, subject + ovector[k], length);
  text[length] = 0;
  vector[filled++] = text;
  text += length + 1;
  }

vector[filled] = NULL;
return 0;
}
/*************************************************
* Free store obtained by get_substring_list *
*************************************************/
| pcreget.c | 311 |
VOID | pcre_free_substring_list(const char **pointer)
/* Hand a substring list block to pcre_free.

Argument:   pointer   the block to be passed to pcre_free
Returns:    nothing
*/
void
pcre_free_substring_list(const char **pointer)
{
void *block = (void *)pointer;
(pcre_free)(block);
}
/*************************************************
* Copy captured string to new store *
*************************************************/
| pcreget.c | 356 |
INT | pcre_get_substring(const char *subject, int *ovector, int stringcount, int stringnumber, const char **stringptr)
/* Copy one captured substring into new store obtained from pcre_malloc,
adding a terminating zero.

Arguments:
  subject        the subject string that was matched
  ovector        the vector of offset pairs from the match
  stringcount    the number of captured strings available
  stringnumber   which captured string is wanted
  stringptr      receives a pointer to the new copy

Returns:         the length of the string (excluding the zero);
                 PCRE_ERROR_NOSUBSTRING if stringnumber is out of range;
                 PCRE_ERROR_NOMEMORY if the store cannot be obtained
*/
int
pcre_get_substring(const char *subject, int *ovector, int stringcount,
  int stringnumber, const char **stringptr)
{
int pair;
int length;
char *copy;

if (stringnumber < 0 || stringnumber >= stringcount)
  return PCRE_ERROR_NOSUBSTRING;

/* Each capture occupies two consecutive slots: start offset, end offset. */

pair = stringnumber * 2;
length = ovector[pair+1] - ovector[pair];

copy = (char *)(pcre_malloc)(length + 1);
if (copy == NULL) return PCRE_ERROR_NOMEMORY;

memcpy(copy, subject + ovector[pair], length);
copy[length] = 0;
*stringptr = copy;
return length;
}
/*************************************************
* Copy named captured string to new store *
*************************************************/
| pcreget.c | 389 |
INT | pcre_get_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr)
/* Copy a named captured substring into new store. The name is resolved with
get_first_set(); a zero or negative result from that lookup is returned
unchanged, otherwise the work is done by pcre_get_substring().

Arguments:
  code           the compiled pattern
  subject        the subject string that was matched
  ovector        the vector of offset pairs from the match
  stringcount    the number of captured strings available
  stringname     the name of the wanted group
  stringptr      receives a pointer to the new copy

Returns:         the result of pcre_get_substring(), or the zero/negative
                 value from the name lookup
*/
int
pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
  int stringcount, const char *stringname, const char **stringptr)
{
int group = get_first_set(code, stringname, ovector);

if (group > 0)
  return pcre_get_substring(subject, ovector, stringcount, group, stringptr);

return group;
}
/*************************************************
* Free store obtained by get_substring *
*************************************************/
| pcreget.c | 436 |
VOID | pcre_free_substring(const char *pointer)
/* Hand a substring to pcre_free.

Argument:   pointer   the string to be passed to pcre_free
Returns:    nothing
*/
void
pcre_free_substring(const char *pointer)
{
void *block = (void *)pointer;
(pcre_free)(block);
}
| pcreget.c | 459 |
pcreinfo.c |
Type | Function | Source | Line |
PCRE_EXP_DEFN INT | pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
/* Return basic information about a compiled pattern.

Arguments:
  argument_re   points to the compiled pattern
  optptr        if not NULL, receives the public option bits
  first_byte    if not NULL, receives the first byte if one is set, else -1
                  for a start-of-line pattern, else -2

Returns:        re->top_bracket on success;
                PCRE_ERROR_NULL if argument_re is NULL;
                PCRE_ERROR_BADMAGIC if the magic number is wrong and
                  _pcre_try_flipped() cannot supply a usable copy
*/
PCRE_EXP_DEFN int
pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
{
real_pcre flipped_re;
const real_pcre *re = (const real_pcre *)argument_re;

if (re == NULL) return PCRE_ERROR_NULL;

/* A wrong magic number may just mean the pattern needs flipping. */

if (re->magic_number != MAGIC_NUMBER &&
    (re = _pcre_try_flipped(re, &flipped_re, NULL, NULL)) == NULL)
  return PCRE_ERROR_BADMAGIC;

if (optptr != NULL)
  *optptr = (int)(re->options & PUBLIC_OPTIONS);

if (first_byte != NULL)
  {
  if ((re->flags & PCRE_FIRSTSET) != 0)
    *first_byte = re->first_byte;
  else if ((re->flags & PCRE_STARTLINE) != 0)
    *first_byte = -1;
  else
    *first_byte = -2;
  }

return re->top_bracket;
}
| pcreinfo.c | 75 |
pcremktb.c |
Type | Function | Source | Line |
CONST UNSIGNED CHAR * | pcre_maketables(void)
/* Build a set of character tables using the <ctype.h> functions, so that the
result reflects the locale in force when this is called. The block holds, in
order: a 256-byte lower-casing table, a 256-byte case-flipping table,
cbit_length bytes of class bit maps, and a 256-byte character type table.

Arguments:   none
Returns:     pointer to the new tables (tables_length bytes), or NULL if the
             memory cannot be obtained
*/
const unsigned char *
pcre_maketables(void)
{
unsigned char *tables, *cursor;
int ch;

#ifndef DFTABLES
tables = (unsigned char*)(pcre_malloc)(tables_length);
#else
tables = (unsigned char*)malloc(tables_length);
#endif

if (tables == NULL) return NULL;
cursor = tables;

/* Table 1: lower casing. */

for (ch = 0; ch < 256; ch++) cursor[ch] = tolower(ch);
cursor += 256;

/* Table 2: case flipping. */

for (ch = 0; ch < 256; ch++)
  {
  if (islower(ch)) cursor[ch] = toupper(ch);
  else cursor[ch] = tolower(ch);
  }
cursor += 256;

/* Table 3: the class bit maps. Every class is tested separately for every
character, with no short cuts based on one class excluding another, because
locales differ. The "space" map takes whatever isspace() reports, VT
included, which is what the POSIX class [:space:] needs. The "word" map is
built from isalnum() plus underscore rather than from upper/lower, since a
character can be alphanumeric without being either (for example \xAA and
\xBA in the fr_FR locale). */

memset(cursor, 0, cbit_length);
for (ch = 0; ch < 256; ch++)
  {
  int slot = ch/8;
  int bit = 1 << (ch&7);

  if (isdigit(ch))               cursor[cbit_digit  + slot] |= bit;
  if (isupper(ch))               cursor[cbit_upper  + slot] |= bit;
  if (islower(ch))               cursor[cbit_lower  + slot] |= bit;
  if (isalnum(ch) || ch == '_')  cursor[cbit_word   + slot] |= bit;
  if (isspace(ch))               cursor[cbit_space  + slot] |= bit;
  if (isxdigit(ch))              cursor[cbit_xdigit + slot] |= bit;
  if (isgraph(ch))               cursor[cbit_graph  + slot] |= bit;
  if (isprint(ch))               cursor[cbit_print  + slot] |= bit;
  if (ispunct(ch))               cursor[cbit_punct  + slot] |= bit;
  if (iscntrl(ch))               cursor[cbit_cntrl  + slot] |= bit;
  }
cursor += cbit_length;

/* Table 4: character types. VT (0x0b) is left out of the white space type
here because Perl does not treat it as white space for \s or for comments
within regexes. */

for (ch = 0; ch < 256; ch++)
  {
  int flags = 0;

  if (ch != 0x0b && isspace(ch)) flags += ctype_space;
  if (isalpha(ch)) flags += ctype_letter;
  if (isdigit(ch)) flags += ctype_digit;
  if (isxdigit(ch)) flags += ctype_xdigit;
  if (isalnum(ch) || ch == '_') flags += ctype_word;

  /* strchr() also matches the terminating zero of its string argument, so
  binary zero is flagged as a meta-character too. That is intended: "meta"
  here means any character that ends a run of data characters. */

  if (strchr("\\*+?{^.$|()[", ch) != 0) flags += ctype_meta;

  cursor[ch] = flags;
  }

return tables;
}
| pcremktb.c | 69 |
pcrenewl.c |
Type | Function | Source | Line |
BOOL | _pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, int *lenptr, BOOL utf8)
/* Test whether a newline sequence starts at the given position. Used for the
"any" style newline types: NLTYPE_ANYCRLF recognizes only CR, LF and CRLF;
any other type value is treated as NLTYPE_ANY.

Arguments:
  ptr       the position to test
  type      the newline type
  endptr    the end of the string
  lenptr    receives the length of the newline sequence when one is found
  utf8      TRUE if the character is to be read as UTF-8

Returns:    TRUE if a newline starts at ptr (with *lenptr set), else FALSE
*/
BOOL
_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr,
  int *lenptr, BOOL utf8)
{
int c;
if (utf8) { GETCHAR(c, ptr); } else c = *ptr;

/* LF and CR are handled the same way for both newline types. A CR that is
immediately followed by LF within the string counts as a two-byte newline. */

switch(c)
  {
  case 0x000a:                                  /* LF */
  *lenptr = 1;
  return TRUE;

  case 0x000d:                                  /* CR */
  *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
  return TRUE;

  default:
  break;
  }

/* ANYCRLF accepts nothing else. */

if (type == NLTYPE_ANYCRLF) return FALSE;

/* NLTYPE_ANY: the remaining newline characters. */

switch(c)
  {
  case 0x000b:                                  /* VT */
  case 0x000c:                                  /* FF */
  *lenptr = 1;
  return TRUE;

  case 0x0085:                                  /* NEL */
  *lenptr = utf8? 2 : 1;
  return TRUE;

  case 0x2028:                                  /* LS */
  case 0x2029:                                  /* PS */
  *lenptr = 3;
  return TRUE;

  default:
  return FALSE;
  }
}
/*************************************************
* Check for newline at previous position *
*************************************************/
| pcrenewl.c | 75 |
BOOL | _pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, int *lenptr, BOOL utf8)
/* Test whether a newline sequence ends immediately before the given
position. Used for the "any" style newline types: NLTYPE_ANYCRLF recognizes
only CR, LF and CRLF; any other type value is treated as NLTYPE_ANY.

Arguments:
  ptr        the position just after the character to test
  type       the newline type
  startptr   the start of the string
  lenptr     receives the length of the newline sequence when one is found
  utf8       TRUE if the character is to be read as UTF-8

Returns:     TRUE if a newline ends at ptr (with *lenptr set), else FALSE
*/
BOOL
_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr,
  int *lenptr, BOOL utf8)
{
int c;

/* Step back to the previous character and fetch it. */

ptr--;
#ifdef SUPPORT_UTF8
if (utf8)
  {
  BACKCHAR(ptr);
  GETCHAR(c, ptr);
  }
else c = *ptr;
#else   /* no UTF-8 support */
c = *ptr;
#endif  /* SUPPORT_UTF8 */

/* LF and CR are handled the same way for both newline types. An LF that is
immediately preceded by CR within the string counts as a two-byte newline. */

switch(c)
  {
  case 0x000a:                                  /* LF */
  *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
  return TRUE;

  case 0x000d:                                  /* CR */
  *lenptr = 1;
  return TRUE;

  default:
  break;
  }

/* ANYCRLF accepts nothing else. */

if (type == NLTYPE_ANYCRLF) return FALSE;

/* NLTYPE_ANY: the remaining newline characters. */

switch(c)
  {
  case 0x000b:                                  /* VT */
  case 0x000c:                                  /* FF */
  *lenptr = 1;
  return TRUE;

  case 0x0085:                                  /* NEL */
  *lenptr = utf8? 2 : 1;
  return TRUE;

  case 0x2028:                                  /* LS */
  case 0x2029:                                  /* PS */
  *lenptr = 3;
  return TRUE;

  default:
  return FALSE;
  }
}
| pcrenewl.c | 125 |
pcreoutf.c |
Type | Function | Source | Line |
INT | _pcre_ord2utf8(int cvalue, uschar *buffer)
/* Convert a character value into a UTF-8 byte string.

Arguments:
  cvalue     the character value
  buffer     where to put the bytes

Returns:     the number of bytes written to buffer; always 0 (and nothing is
             written) when SUPPORT_UTF8 is not defined
*/
int
_pcre_ord2utf8(int cvalue, uschar *buffer)
{
#ifdef SUPPORT_UTF8
int extra = 0;
int pos;

/* The number of continuation bytes is the index of the first table1 entry
that is not less than the value. */

while (extra < _pcre_utf8_table1_size && cvalue > _pcre_utf8_table1[extra])
  extra++;

/* Fill the continuation bytes from the last one backwards, six bits at a
time, then put the remaining bits into the lead byte. */

for (pos = extra; pos > 0; pos--)
  {
  buffer[pos] = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }
buffer[0] = _pcre_utf8_table2[extra] | cvalue;
return extra + 1;
#else
/* pacify warnings */
(void)(cvalue);
(void)(buffer);
return 0;   /* Keep compiler happy; this function won't ever be */
#endif      /* called when SUPPORT_UTF8 is not defined. */
}
| pcreoutf.c | 65 |
pcrerefc.c |
Type | Function | Source | Line |
PCRE_EXP_DEFN INT | pcre_refcount(pcre *argument_re, int adjust)
/* Adjust the reference count kept in a compiled pattern. The stored count is
clamped so that it never goes below 0 or above 65535.

Arguments:
  argument_re   points to the compiled pattern
  adjust        the value to add to the count (may be negative or zero)

Returns:        the new value of the count, or PCRE_ERROR_NULL if
                argument_re is NULL
*/
PCRE_EXP_DEFN int
pcre_refcount(pcre *argument_re, int adjust)
{
real_pcre *re = (real_pcre *)argument_re;

if (re == NULL) return PCRE_ERROR_NULL;

if (-adjust > re->ref_count)
  re->ref_count = 0;                      /* would go below zero */
else if (adjust + re->ref_count > 65535)
  re->ref_count = 65535;                  /* would exceed the maximum */
else
  re->ref_count = re->ref_count + adjust;

return re->ref_count;
}
| pcrerefc.c | 71 |
pcrestud.c |
Type | Function | Source | Line |
STATIC VOID | set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
/* Set the bit for byte c in a start-bits map and, for caseless matching of a
letter, also the bit for the byte that cd->fcc maps it to.

Arguments:
  start_bits   the bit map (one bit per byte value)
  c            the byte value whose bit is to be set
  caseless     TRUE if matching is caseless
  cd           the block with the ctypes and fcc tables

Returns:       nothing
*/
static void
set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
{
unsigned int flipped;

start_bits[c/8] |= (1 << (c&7));

if (!caseless) return;
if ((cd->ctypes[c] & ctype_letter) == 0) return;

flipped = cd->fcc[c];
start_bits[flipped/8] |= (1 << (flipped&7));
}
/*************************************************
* Create bitmap of starting bytes *
*************************************************/
| pcrestud.c | 73 |
STATIC INT | set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, BOOL utf8, compile_data *cd)
/* Scan a compiled (sub)pattern and set a bit in start_bits for every byte
value that can start a match of it. Each alternative branch is scanned in
turn; the function recurses for nested brackets and positive lookaheads.

Arguments:
  code         points to the opening bracket item of the (sub)pattern
  start_bits   the 32-byte bit map being built (one bit per byte value)
  caseless     the current state of the caseless flag
  utf8         TRUE if the pattern is in UTF-8 mode
  cd           the block with the character tables

Returns:       SSB_FAIL     an item was met from which no starting byte set
                            can be derived
               SSB_DONE     every branch has a mandatory first item whose
                            bits have been set
               SSB_CONTINUE a branch ended (at ALT or KET) without a
                            mandatory item being found
*/
static int
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
  BOOL utf8, compile_data *cd)
{
register int c;
int yield = SSB_DONE;

#if 0
/* ========================================================================= */
/* The following comment and code was inserted in January 1999. In May 2006,
when it was observed to cause compiler warnings about unused values, I took it
out again. If anybody is still using OS/2, they will have to put it back
manually. */

/* This next statement and the later reference to dummy are here in order to
trick the optimizer of the IBM C compiler for OS/2 into generating correct
code. Apparently IBM isn't going to fix the problem, and we would rather not
disable optimization (in this module it actually makes a big difference, and
the pcre module can use all the optimization it can get). */

volatile int dummy;
/* ========================================================================= */
#endif

do
  {
  const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
  BOOL try_next = TRUE;

  while (try_next)    /* Loop for items in this branch */
    {
    int rc;
    switch(*tcode)
      {
      /* Fail if we reach something we don't understand */

      default:
      return SSB_FAIL;

      /* If we hit a bracket or a positive lookahead assertion, recurse to set
      bits from within the subpattern. If it can't find anything, we have to
      give up. If it finds some mandatory character(s), we are done for this
      branch. Otherwise, carry on scanning after the subpattern. */

      case OP_BRA:
      case OP_SBRA:
      case OP_CBRA:
      case OP_SCBRA:
      case OP_ONCE:
      case OP_ASSERT:
      rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
      if (rc == SSB_FAIL) return SSB_FAIL;
      if (rc == SSB_DONE) try_next = FALSE; else
        {
        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
        tcode += 1 + LINK_SIZE;
        }
      break;

      /* If we hit ALT or KET, it means we haven't found anything mandatory in
      this branch, though we might have found something optional. For ALT, we
      continue with the next alternative, but we have to arrange that the final
      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
      return SSB_CONTINUE: if this is the top level, that indicates failure,
      but after a nested subpattern, it causes scanning to continue. */

      case OP_ALT:
      yield = SSB_CONTINUE;
      try_next = FALSE;
      break;

      case OP_KET:
      case OP_KETRMAX:
      case OP_KETRMIN:
      return SSB_CONTINUE;

      /* Skip over callout */

      case OP_CALLOUT:
      tcode += 2 + 2*LINK_SIZE;
      break;

      /* Skip over lookbehind and negative lookahead assertions */

      case OP_ASSERT_NOT:
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
      tcode += 1 + LINK_SIZE;
      break;

      /* Skip over an option setting, changing the caseless flag */

      case OP_OPT:
      caseless = (tcode[1] & PCRE_CASELESS) != 0;
      tcode += 2;
      break;

      /* BRAZERO does the bracket, but carries on. */

      case OP_BRAZERO:
      case OP_BRAMINZERO:
      if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
        return SSB_FAIL;
/* =========================================================================
      See the comment at the head of this function concerning the next line,
      which was an old fudge for the benefit of OS/2.
      dummy = 1;
  ========================================================================= */
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
      tcode += 1 + LINK_SIZE;
      break;

      /* SKIPZERO skips the bracket. */

      case OP_SKIPZERO:
      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
      tcode += 1 + LINK_SIZE;
      break;

      /* Single-char * or ? sets the bit and tries the next item */

      case OP_STAR:
      case OP_MINSTAR:
      case OP_POSSTAR:
      case OP_QUERY:
      case OP_MINQUERY:
      case OP_POSQUERY:
      set_bit(start_bits, tcode[1], caseless, cd);
      tcode += 2;
#ifdef SUPPORT_UTF8
      if (utf8 && tcode[-1] >= 0xc0)
        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
      break;

      /* Single-char upto sets the bit and tries the next */

      case OP_UPTO:
      case OP_MINUPTO:
      case OP_POSUPTO:
      set_bit(start_bits, tcode[3], caseless, cd);
      tcode += 4;
#ifdef SUPPORT_UTF8
      if (utf8 && tcode[-1] >= 0xc0)
        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
      break;

      /* At least one single char sets the bit and stops */

      case OP_EXACT:       /* Fall through */
      tcode += 2;

      case OP_CHAR:
      case OP_CHARNC:
      case OP_PLUS:
      case OP_MINPLUS:
      case OP_POSPLUS:
      set_bit(start_bits, tcode[1], caseless, cd);
      try_next = FALSE;
      break;

      /* Single character type sets the bits and stops */

      case OP_NOT_DIGIT:
      for (c = 0; c < 32; c++)
        start_bits[c] |= ~cd->cbits[c+cbit_digit];
      try_next = FALSE;
      break;

      case OP_DIGIT:
      for (c = 0; c < 32; c++)
        start_bits[c] |= cd->cbits[c+cbit_digit];
      try_next = FALSE;
      break;

      /* The cbit_space table has vertical tab as whitespace; we have to
      discard it. */

      case OP_NOT_WHITESPACE:
      for (c = 0; c < 32; c++)
        {
        int d = cd->cbits[c+cbit_space];
        if (c == 1) d &= ~0x08;
        start_bits[c] |= ~d;
        }
      try_next = FALSE;
      break;

      /* The cbit_space table has vertical tab as whitespace; we have to
      discard it. */

      case OP_WHITESPACE:
      for (c = 0; c < 32; c++)
        {
        int d = cd->cbits[c+cbit_space];
        if (c == 1) d &= ~0x08;
        start_bits[c] |= d;
        }
      try_next = FALSE;
      break;

      case OP_NOT_WORDCHAR:
      for (c = 0; c < 32; c++)
        start_bits[c] |= ~cd->cbits[c+cbit_word];
      try_next = FALSE;
      break;

      case OP_WORDCHAR:
      for (c = 0; c < 32; c++)
        start_bits[c] |= cd->cbits[c+cbit_word];
      try_next = FALSE;
      break;

      /* One or more character type fudges the pointer and restarts, knowing
      it will hit a single character type and stop there. */

      case OP_TYPEPLUS:
      case OP_TYPEMINPLUS:
      tcode++;
      break;

      case OP_TYPEEXACT:
      tcode += 3;
      break;

      /* Zero or more repeats of character types set the bits and then
      try again. */

      case OP_TYPEUPTO:
      case OP_TYPEMINUPTO:
      case OP_TYPEPOSUPTO:
      tcode += 2;               /* Fall through */

      case OP_TYPESTAR:
      case OP_TYPEMINSTAR:
      case OP_TYPEPOSSTAR:
      case OP_TYPEQUERY:
      case OP_TYPEMINQUERY:
      case OP_TYPEPOSQUERY:
      switch(tcode[1])
        {
        case OP_ANY:
        case OP_ALLANY:
        return SSB_FAIL;

        case OP_NOT_DIGIT:
        for (c = 0; c < 32; c++)
          start_bits[c] |= ~cd->cbits[c+cbit_digit];
        break;

        case OP_DIGIT:
        for (c = 0; c < 32; c++)
          start_bits[c] |= cd->cbits[c+cbit_digit];
        break;

        /* The cbit_space table has vertical tab as whitespace; we have to
        discard it. */

        case OP_NOT_WHITESPACE:
        for (c = 0; c < 32; c++)
          {
          int d = cd->cbits[c+cbit_space];
          if (c == 1) d &= ~0x08;
          start_bits[c] |= ~d;
          }
        break;

        /* The cbit_space table has vertical tab as whitespace; we have to
        discard it. */

        case OP_WHITESPACE:
        for (c = 0; c < 32; c++)
          {
          int d = cd->cbits[c+cbit_space];
          if (c == 1) d &= ~0x08;
          start_bits[c] |= d;
          }
        break;

        case OP_NOT_WORDCHAR:
        for (c = 0; c < 32; c++)
          start_bits[c] |= ~cd->cbits[c+cbit_word];
        break;

        case OP_WORDCHAR:
        for (c = 0; c < 32; c++)
          start_bits[c] |= cd->cbits[c+cbit_word];
        break;
        }

      tcode += 2;
      break;

      /* Character class where all the information is in a bit map: set the
      bits and either carry on or not, according to the repeat count. If it was
      a negative class, and we are operating with UTF-8 characters, any byte
      with a value >= 0xc4 is a potentially valid starter because it starts a
      character with a value > 255. */

      case OP_NCLASS:
#ifdef SUPPORT_UTF8
      if (utf8)
        {
        start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc7 */
        memset(start_bits+25, 0xff, 7);      /* Bits for 0xc8 - 0xff */
        }
#endif
      /* Fall through */

      case OP_CLASS:
        {
        tcode++;

        /* In UTF-8 mode, the bits in a bit map correspond to character
        values, not to byte values. However, the bit map we are constructing is
        for byte values. So we have to do a conversion for characters whose
        value is > 127. In fact, there are only two possible starting bytes for
        characters in the range 128 - 255. */

#ifdef SUPPORT_UTF8
        if (utf8)
          {
          for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
          for (c = 128; c < 256; c++)
            {
            /* Test the single bit for character c. This must be a bitwise
            AND: a logical AND would be true whenever the containing bitmap
            byte is non-zero, setting a starter bit for characters that are
            not in the class. */

            if ((tcode[c/8] & (1 << (c&7))) != 0)
              {
              int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
              start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
              c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
              }
            }
          }

        /* In non-UTF-8 mode, the two bit maps are completely compatible. */

        else
#endif
          {
          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
          }

        /* Advance past the bit map, and act on what follows */

        tcode += 32;
        switch (*tcode)
          {
          case OP_CRSTAR:
          case OP_CRMINSTAR:
          case OP_CRQUERY:
          case OP_CRMINQUERY:
          tcode++;
          break;

          /* A range whose minimum count is zero makes the class optional, so
          scanning carries on after it; otherwise the class is mandatory. */

          case OP_CRRANGE:
          case OP_CRMINRANGE:
          if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
            else try_next = FALSE;
          break;

          default:
          try_next = FALSE;
          break;
          }
        }
      break; /* End of bitmap class handling */

      }      /* End of switch */
    }        /* End of try_next loop */

  code += GET(code, 1);   /* Advance to next branch */
  }
while (*code == OP_ALT);
return yield;
}
/*************************************************
* Study a compiled expression *
*************************************************/
| pcrestud.c | 107 |
PCRE_EXP_DEFN PCRE_EXTRA * | pcre_study(const pcre *external_re, int options, const char **errorptr)
/* Study a compiled pattern and, when one can be found, record the set of
bytes that may start a match.

Arguments:
  external_re   the compiled expression as handed to the caller
  options       study option bits; anything outside PUBLIC_STUDY_OPTIONS is
                rejected
  errorptr      set to NULL on entry and to a static message on error

Returns:        a block obtained through pcre_malloc that holds a pcre_extra
                immediately followed by its pcre_study_data, or NULL. NULL
                with *errorptr still NULL means no study data was produced;
                NULL with *errorptr set means an error occurred.
*/
PCRE_EXP_DEFN pcre_extra *
pcre_study(const pcre *external_re, int options, const char **errorptr)
{
  const real_pcre *re = (const real_pcre *)external_re;
  compile_data compile_block;
  const uschar *tables;
  uschar *code;
  uschar start_bits[32];
  pcre_extra *extra;
  pcre_study_data *study;

  *errorptr = NULL;

  /* Argument validation. */
  if (re == NULL || re->magic_number != MAGIC_NUMBER) {
    *errorptr = "argument is not a compiled regular expression";
    return NULL;
  }
  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) {
    *errorptr = "unknown or incorrect option bit(s) set";
    return NULL;
  }

  /* Nothing further is done for an anchored pattern, a pattern with a known
  first char, or a multiline pattern that matches only at "line starts". */
  if ((re->options & PCRE_ANCHORED) != 0)
    return NULL;
  if ((re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
    return NULL;

  /* The compiled code follows the name table. */
  code = (uschar *)re + re->name_table_offset +
    (re->name_count * re->name_entry_size);

  /* Pick up the character tables, asking for the defaults when the pattern
  carries none, and publish them in the block that is passed around. */
  tables = re->tables;
  if (tables == NULL) {
    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
      (void *)(&tables));
  }
  compile_block.lcc = tables + lcc_offset;
  compile_block.fcc = tables + fcc_offset;
  compile_block.cbits = tables + cbits_offset;
  compile_block.ctypes = tables + ctypes_offset;

  /* Try to find a fixed set of initial bytes; give up quietly otherwise. */
  memset(start_bits, 0, sizeof(start_bits));
  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
      (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE)
    return NULL;

  /* One allocation carries both the pcre_extra block and the study data it
  points to. The size of the study data is stored in the block itself so that
  it can be reported even if it stops being fixed in the future. */
  extra = (pcre_extra *)(pcre_malloc)
    (sizeof(pcre_extra) + sizeof(pcre_study_data));
  if (extra == NULL) {
    *errorptr = "failed to get memory";
    return NULL;
  }
  study = (pcre_study_data *)(extra + 1);

  extra->flags = PCRE_EXTRA_STUDY_DATA;
  extra->study_data = study;

  study->size = sizeof(pcre_study_data);
  study->options = PCRE_STUDY_MAPPED;
  memcpy(study->start_bits, start_bits, sizeof(start_bits));

  return extra;
}
| pcrestud.c | 506 |
pcretryf.c |
Type | Function | Source | Line |
STATIC UNSIGNED LONG INT | byteflip(unsigned long int value, int n)
/* Reverse the byte order of a 2-byte or 4-byte quantity.

Arguments:
  value   the value to flip
  n       field width in bytes; 2 swaps the two low bytes, any other value
          reverses the four low bytes

Returns:  the flipped value; bits above the bytes being flipped are dropped
*/
static unsigned long int
byteflip(unsigned long int value, int n)
{
  unsigned long int byte0 = value & 0xff;
  unsigned long int byte1 = (value >> 8) & 0xff;
  unsigned long int byte2;
  unsigned long int byte3;

  if (n == 2)
    return (byte0 << 8) | byte1;

  byte2 = (value >> 16) & 0xff;
  byte3 = (value >> 24) & 0xff;
  return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}
/*************************************************
* Test for a byte-flipped compiled regex *
*************************************************/
| pcretryf.c | 69 |
REAL_PCRE * | _pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, const pcre_study_data *study, pcre_study_data *internal_study)
/* Test whether a compiled pattern has the opposite byte order and, if so,
produce byte-flipped copies of its header fields.

Arguments:
  re              the pattern block as supplied
  internal_re     where the flipped copy of *re is written
  study           the study data as supplied, or NULL
  internal_study  where the flipped copy of *study is written (only touched
                  when study is not NULL)

Returns:          internal_re when the flipped magic number matches
                  MAGIC_NUMBER, otherwise NULL (nothing is written)
*/
real_pcre *
_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re,
  const pcre_study_data *study, pcre_study_data *internal_study)
{
/* Flip one field, using the field's own size to select the flip width. */
#define TRYF_RE(field)     byteflip(re->field, sizeof(re->field))
#define TRYF_RE16(field)   ((pcre_uint16)TRYF_RE(field))
#define TRYF_STUDY(field)  byteflip(study->field, sizeof(study->field))

  if (TRYF_RE(magic_number) != MAGIC_NUMBER)
    return NULL;

  /* Start from a straight copy so that every field that is not flipped
  below is carried over unchanged. */
  *internal_re = *re;

  internal_re->size              = TRYF_RE(size);
  internal_re->options           = TRYF_RE(options);
  internal_re->flags             = TRYF_RE16(flags);
  internal_re->top_bracket       = TRYF_RE16(top_bracket);
  internal_re->top_backref       = TRYF_RE16(top_backref);
  internal_re->first_byte        = TRYF_RE16(first_byte);
  internal_re->req_byte          = TRYF_RE16(req_byte);
  internal_re->name_table_offset = TRYF_RE16(name_table_offset);
  internal_re->name_entry_size   = TRYF_RE16(name_entry_size);
  internal_re->name_count        = TRYF_RE16(name_count);

  /* Study data is optional; same copy-then-flip treatment. */
  if (study != NULL) {
    *internal_study = *study;
    internal_study->size    = TRYF_STUDY(size);
    internal_study->options = TRYF_STUDY(options);
  }

#undef TRYF_STUDY
#undef TRYF_RE16
#undef TRYF_RE

  return internal_re;
}
| pcretryf.c | 101 |
pcrever.c |
Type | Function | Source | Line |
PCRE_EXP_DEFN CONST CHAR * | pcre_version(void)
/* Return the library version as a static string.

The test looks at the second character of XSTRING(Z PCRE_PRERELEASE): when it
is the terminating zero, the whole version is built with a single XSTRING
call; otherwise the number and the prerelease/date parts are built by two
adjacent XSTRING calls.
NOTE(review): XSTRING presumably expands its argument and then stringifies
it, so a zero second character would mean PCRE_PRERELEASE is empty - confirm
against the macro definition.
*/
PCRE_EXP_DEFN const char *
pcre_version(void)
{
  if (XSTRING(Z PCRE_PRERELEASE)[1] != 0)
    return XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE);

  return XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE);
}
| pcrever.c | 82 |
pcrevutf.c |
Type | Function | Source | Line |
INT | _pcre_valid_utf8(const uschar *string, int length)
/* Check a string for UTF-8 validity.

Arguments:
  string   the string to check
  length   its length in bytes, or a negative value for a zero-terminated
           string

Returns:   -1 if no invalid sequence is found (always, when SUPPORT_UTF8 is
           not defined); otherwise the offset of the byte at which the check
           failed. That is the lead byte for a bad or truncated lead, and the
           offending trailing byte for failures detected later in a sequence.
*/
int
_pcre_valid_utf8(const uschar *string, int length)
{
#ifdef SUPPORT_UTF8
  const uschar *cur;

  /* A negative length means "measure it": scan to the terminating zero. */
  if (length < 0) {
    cur = string;
    while (*cur != 0) cur++;
    length = cur - string;
  }

  for (cur = string; length-- > 0; cur++) {
    int lead = *cur;
    int extra;

    if (lead < 128) continue;                   /* Plain 7-bit byte */
    if (lead < 0xc0) return cur - string;       /* Stray trailing byte */

    extra = _pcre_utf8_table4[lead & 0x3f];     /* Number of additional bytes */
    if (extra > 3 || length < extra) return cur - string;
    length -= extra;

    /* The second byte must have 10 as its top two bits. */
    cur++;
    if ((*cur & 0xc0) != 0x80) return cur - string;

    /* Reject overlong encodings for each sequence length, the range whose
    lead byte is 0xed with a second byte of 0xa0 or more, and anything
    greater than 0x0010ffff. */
    switch (extra) {
      /* xx00 000x in the lead byte is an overlong sequence. There are no
      more bytes to check, so move straight on to the next character. */
      case 1:
        if ((lead & 0x3e) == 0) return cur - string;
        continue;

      /* 1110 0000, xx0x xxxx is overlong; 1110 1101, 101x xxxx (and above)
      is the excluded range. */
      case 2:
        if ((lead == 0xe0 && (*cur & 0x20) == 0) ||
            (lead == 0xed && *cur >= 0xa0))
          return cur - string;
        break;

      /* 1111 0000, xx00 xxxx is overlong; a lead above 0xf4, or 0xf4 with a
      second byte above 0x8f, exceeds 0x0010ffff (f4 8f bf bf). */
      case 3:
        if ((lead == 0xf0 && (*cur & 0x30) == 0) ||
            (lead > 0xf4) ||
            (lead == 0xf4 && *cur > 0x8f))
          return cur - string;
        break;

#if 0
      /* These cases can no longer occur, because sequences are restricted to
      a maximum of four bytes. The code is kept in case an option for longer
      sequences is ever wanted. */

      /* 1111 1000, xx00 0xxx */
      case 4:
        if (lead == 0xf8 && (*cur & 0x38) == 0) return cur - string;
        break;

      /* Leading 0xfe or 0xff, and then 1111 1100, xx00 00xx */
      case 5:
        if (lead == 0xfe || lead == 0xff ||
            (lead == 0xfc && (*cur & 0x3c) == 0)) return cur - string;
        break;
#endif
    }

    /* Any bytes after the second must also start with 10. */
    while (--extra > 0) {
      cur++;
      if ((*cur & 0xc0) != 0x80) return cur - string;
    }
  }
#else
  /* Without UTF-8 support there is nothing to check; just silence the
  unused-argument warnings. */
  (void)(string);
  (void)(length);
#endif
  return -1;
}
| pcrevutf.c | 77 |
pcrexcls.c |
Type | Function | Source | Line |
BOOL | _pcre_xclass(int c, const uschar *data)
/* Match a character against an extended class.

Arguments:
  c      the character value to test
  data   the class data: a flag byte (XCL_NOT, XCL_MAP), an optional 32-byte
         bit map, then a list of items terminated by XCL_END

Returns: TRUE if the character matches the class, FALSE otherwise (the
         XCL_NOT flag inverts the sense of every hit and of the final miss)
*/
BOOL
_pcre_xclass(int c, const uschar *data)
{
  const BOOL inverted = (*data & XCL_NOT) != 0;
  const BOOL has_map = (*data & XCL_MAP) != 0;
  int item;

  /* Values below 256 are looked up in the bit map when there is one. A miss
  here is not final: the item list may hold ranges that start below 256. */
  if (has_map && c < 256 && (data[1 + c/8] & (1 << (c&7))) != 0)
    return !inverted;

  /* Step over the flag byte and, if present, the bit map. */
  data += has_map ? 33 : 1;

  /* Walk the items: single characters, ranges, and (only when UCP support
  is compiled in) Unicode property tests. */
  for (item = *data++; item != XCL_END; item = *data++) {
    int lo, hi;

    if (item == XCL_SINGLE) {
      GETCHARINC(lo, data);
      if (c == lo) return !inverted;
    }
    else if (item == XCL_RANGE) {
      GETCHARINC(lo, data);
      GETCHARINC(hi, data);
      if (c >= lo && c <= hi) return !inverted;
    }
#ifdef SUPPORT_UCP
    else {  /* XCL_PROP & XCL_NOTPROP */
      int chartype, script;
      int category = _pcre_ucp_findprop(c, &chartype, &script);
      const BOOL want = (item == XCL_PROP);   /* FALSE for XCL_NOTPROP */

      switch (*data) {
        case PT_ANY:
          if (want) return !inverted;
          break;

        case PT_LAMP:
          if ((chartype == ucp_Lu || chartype == ucp_Ll ||
               chartype == ucp_Lt) == want)
            return !inverted;
          break;

        case PT_GC:
          if ((data[1] == category) == want) return !inverted;
          break;

        case PT_PC:
          if ((data[1] == chartype) == want) return !inverted;
          break;

        case PT_SC:
          if ((data[1] == script) == want) return !inverted;
          break;

        /* Not expected to happen; present so that compilers do not complain
        about a missing default. */
        default:
          return FALSE;
      }

      data += 2;    /* Step over the property type and its value */
    }
#endif  /* SUPPORT_UCP */
  }

  return inverted;   /* char did not match */
}
| pcrexcls.c | 67 |
_hbpcreg.c |
Type | Function | Source | Line |
STATIC VOID * | hb_pcre_grab( size_t size )
/* Allocation hook with the void *(*)(size_t) shape that the PCRE allocator
   pointers require; it simply forwards the request to hb_xgrab(). */
static void * hb_pcre_grab( size_t size )
{
   void * pBlock = hb_xgrab( size );

   return pBlock;
}
/* The guard below is hard-wired on, so _hbconf.h is always included. */
#if 1
#include "_hbconf.h"
#endif
#include "pcreinal.h"
/* Unless VPCOMPAT is defined, define the replaceable PCRE hooks here:
   - pcre_malloc and pcre_stack_malloc both start out as hb_pcre_grab;
   - pcre_free and pcre_stack_free both start out as hb_xfree;
   - pcre_callout starts out as NULL. */
#ifndef VPCOMPAT
HB_EXTERN_BEGIN
PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = hb_pcre_grab;
PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = hb_xfree;
PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = hb_pcre_grab;
PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = hb_xfree;
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
HB_EXTERN_END
#endif
| _hbpcreg.c | 58 |
|