fix ugly bugs in TRE regex parser

1. * in BRE is not special at the beginning of the regex or a
subexpression. this broke ncurses' build scripts.

2. \\( in BRE is a literal \ followed by a literal (, not a literal \
followed by a subexpression opener.

3. the ^ in \\(^ in BRE is a literal ^; ^ is an anchor only at the
beginning of the entire BRE. POSIX allows treating ^ as an anchor at the beginning of
a subexpression, but TRE's code for checking if it was at the
beginning of a subexpression was wrong, and fixing it for the sake of
supporting a non-portable usage was too much trouble when just
removing this non-portable behavior was much easier.

this patch also moved lots of the ugly logic for empty atom checking
out of the default/literal case and into new cases for the relevant
characters. this should make parsing faster and make the code smaller.
if nothing else it's a lot more readable/logical.

at some point i'd like to revisit and overhaul lots of this code...
This commit is contained in:
Rich Felker 2012-05-07 14:50:49 -04:00
parent 2d3e2a7fc1
commit d7a90b35b9

View File

@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx)
tre_stack_t *stack = ctx->stack;
int bottom = tre_stack_num_objects(stack);
int depth = 0;
wchar_t wc;
int clen;
if (!ctx->nofirstsub)
{
@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx)
{
case CHAR_LPAREN: /* parenthesized subexpression */
if (ctx->cflags & REG_EXTENDED
|| (ctx->re > ctx->re_start
&& *(ctx->re - 1) == CHAR_BACKSLASH))
if (ctx->cflags & REG_EXTENDED)
{
lparen:
depth++;
{
ctx->re++;
@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx)
goto parse_literal;
break;
case CHAR_RPAREN: /* end of current subexpression */
if ((ctx->cflags & REG_EXTENDED && depth > 0)
|| (ctx->re > ctx->re_start
&& *(ctx->re - 1) == CHAR_BACKSLASH))
{
/* We were expecting an atom, but instead the current
subexpression was closed. POSIX leaves the meaning of
this to be implementation-defined. We interpret this as
an empty expression (which matches an empty string). */
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (result == NULL)
return REG_ESPACE;
if (!(ctx->cflags & REG_EXTENDED))
ctx->re--;
}
else
goto parse_literal;
break;
case CHAR_LBRACKET: /* bracket expression */
ctx->re++;
status = tre_parse_bracket(ctx, &result);
@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx)
case CHAR_BACKSLASH:
/* If this is "\(" or "\)" chew off the backslash and
try again. */
if (!(ctx->cflags & REG_EXTENDED)
&& (*(ctx->re + 1) == CHAR_LPAREN
|| *(ctx->re + 1) == CHAR_RPAREN))
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
{
ctx->re++;
STACK_PUSHX(stack, int, PARSE_ATOM);
break;
goto lparen;
}
/* "\)" in a BRE ends the current subexpression where an atom was
   expected, so produce an empty atom; ctx->re is left pointing at the
   backslash.  The test must be for CHAR_RPAREN: "\(" has already been
   handled by the preceding check, so testing CHAR_LPAREN again would
   make this branch unreachable. */
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
{
goto empty_atom;
}
/* If a macro is used, parse the expanded macro recursively. */
@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx)
break;
case CHAR_CARET: /* beginning of line assertion */
/* '^' has a special meaning everywhere in EREs, and in the
beginning of the RE and after \( is BREs. */
/* '^' has a special meaning everywhere in EREs, and at
beginning of BRE. */
if (ctx->cflags & REG_EXTENDED
|| (ctx->re - 2 >= ctx->re_start
&& *(ctx->re - 2) == CHAR_BACKSLASH
&& *(ctx->re - 1) == CHAR_LPAREN)
|| ctx->re == ctx->re_start)
{
result = tre_ast_new_literal(ctx->mem, ASSERTION,
@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx)
case CHAR_DOLLAR: /* end of line assertion. */
/* '$' is special everywhere in EREs, and in the end of the
string and before \) is BREs. */
string in BREs. */
if (ctx->cflags & REG_EXTENDED
|| (*(ctx->re + 1) == CHAR_BACKSLASH
&& *(ctx->re + 2) == CHAR_RPAREN)
|| !*(ctx->re + 1))
{
result = tre_ast_new_literal(ctx->mem, ASSERTION,
@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx)
goto parse_literal;
break;
case CHAR_RPAREN:
if (!depth)
goto parse_literal;
case CHAR_STAR:
case CHAR_PIPE:
case CHAR_LBRACE:
case CHAR_PLUS:
case CHAR_QUESTIONMARK:
if (!(ctx->cflags & REG_EXTENDED))
goto parse_literal;
empty_atom:
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (!result)
return REG_ESPACE;
break;
default:
parse_literal:
/* We are expecting an atom. If the subexpression (or the whole
regexp ends here, we interpret it as an empty expression
(which matches an empty string). */
if (
(!*ctx->re
|| *ctx->re == CHAR_STAR
|| (ctx->cflags & REG_EXTENDED
&& (*ctx->re == CHAR_PIPE
|| *ctx->re == CHAR_LBRACE
|| *ctx->re == CHAR_PLUS
|| *ctx->re == CHAR_QUESTIONMARK))
/* Test for "\)" in BRE mode. */
|| (!(ctx->cflags & REG_EXTENDED)
&& !*(ctx->re + 1)
&& *ctx->re == CHAR_BACKSLASH
&& *(ctx->re + 1) == CHAR_LBRACE)))
{
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
if (!result)
return REG_ESPACE;
break;
}
wchar_t wc;
int clen = mbtowc(&wc, ctx->re, -1);
clen = mbtowc(&wc, ctx->re, -1);
if (clen<0) clen=1, wc=WEOF;
/* Note that we can't use an tre_isalpha() test here, since there