mirror of
git://git.musl-libc.org/musl
synced 2025-01-29 10:02:46 +00:00
fix ugly bugs in TRE regex parser
1. * in BRE is not special at the beginning of the regex or a subexpression. this broke ncurses' build scripts. 2. \\( in BRE is a literal \ followed by a literal (, not a literal \ followed by a subexpression opener. 3. the ^ in \\(^ in BRE is a literal ^ only at the beginning of the entire BRE. POSIX allows treating it as an anchor at the beginning of a subexpression, but TRE's code for checking if it was at the beginning of a subexpression was wrong, and fixing it for the sake of supporting a non-portable usage was too much trouble when just removing this non-portable behavior was much easier. this patch also moved lots of the ugly logic for empty atom checking out of the default/literal case and into new cases for the relevant characters. this should make parsing faster and make the code smaller. if nothing else it's a lot more readable/logical. at some point i'd like to revisit and overhaul lots of this code...
This commit is contained in:
parent
2d3e2a7fc1
commit
d7a90b35b9
@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
tre_stack_t *stack = ctx->stack;
|
||||
int bottom = tre_stack_num_objects(stack);
|
||||
int depth = 0;
|
||||
wchar_t wc;
|
||||
int clen;
|
||||
|
||||
if (!ctx->nofirstsub)
|
||||
{
|
||||
@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
{
|
||||
case CHAR_LPAREN: /* parenthesized subexpression */
|
||||
|
||||
if (ctx->cflags & REG_EXTENDED
|
||||
|| (ctx->re > ctx->re_start
|
||||
&& *(ctx->re - 1) == CHAR_BACKSLASH))
|
||||
if (ctx->cflags & REG_EXTENDED)
|
||||
{
|
||||
lparen:
|
||||
depth++;
|
||||
{
|
||||
ctx->re++;
|
||||
@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
goto parse_literal;
|
||||
break;
|
||||
|
||||
case CHAR_RPAREN: /* end of current subexpression */
|
||||
if ((ctx->cflags & REG_EXTENDED && depth > 0)
|
||||
|| (ctx->re > ctx->re_start
|
||||
&& *(ctx->re - 1) == CHAR_BACKSLASH))
|
||||
{
|
||||
/* We were expecting an atom, but instead the current
|
||||
subexpression was closed. POSIX leaves the meaning of
|
||||
this to be implementation-defined. We interpret this as
|
||||
an empty expression (which matches an empty string). */
|
||||
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
|
||||
if (result == NULL)
|
||||
return REG_ESPACE;
|
||||
if (!(ctx->cflags & REG_EXTENDED))
|
||||
ctx->re--;
|
||||
}
|
||||
else
|
||||
goto parse_literal;
|
||||
break;
|
||||
|
||||
case CHAR_LBRACKET: /* bracket expression */
|
||||
ctx->re++;
|
||||
status = tre_parse_bracket(ctx, &result);
|
||||
@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
case CHAR_BACKSLASH:
|
||||
/* If this is "\(" or "\)" chew off the backslash and
|
||||
try again. */
|
||||
if (!(ctx->cflags & REG_EXTENDED)
|
||||
&& (*(ctx->re + 1) == CHAR_LPAREN
|
||||
|| *(ctx->re + 1) == CHAR_RPAREN))
|
||||
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
|
||||
{
|
||||
ctx->re++;
|
||||
STACK_PUSHX(stack, int, PARSE_ATOM);
|
||||
break;
|
||||
goto lparen;
|
||||
}
|
||||
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
|
||||
{
|
||||
goto empty_atom;
|
||||
}
|
||||
|
||||
/* If a macro is used, parse the expanded macro recursively. */
|
||||
@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
break;
|
||||
|
||||
case CHAR_CARET: /* beginning of line assertion */
|
||||
/* '^' has a special meaning everywhere in EREs, and in the
|
||||
beginning of the RE and after \( is BREs. */
|
||||
/* '^' has a special meaning everywhere in EREs, and at
|
||||
beginning of BRE. */
|
||||
if (ctx->cflags & REG_EXTENDED
|
||||
|| (ctx->re - 2 >= ctx->re_start
|
||||
&& *(ctx->re - 2) == CHAR_BACKSLASH
|
||||
&& *(ctx->re - 1) == CHAR_LPAREN)
|
||||
|| ctx->re == ctx->re_start)
|
||||
{
|
||||
result = tre_ast_new_literal(ctx->mem, ASSERTION,
|
||||
@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
|
||||
case CHAR_DOLLAR: /* end of line assertion. */
|
||||
/* '$' is special everywhere in EREs, and in the end of the
|
||||
string and before \) is BREs. */
|
||||
string in BREs. */
|
||||
if (ctx->cflags & REG_EXTENDED
|
||||
|| (*(ctx->re + 1) == CHAR_BACKSLASH
|
||||
&& *(ctx->re + 2) == CHAR_RPAREN)
|
||||
|| !*(ctx->re + 1))
|
||||
{
|
||||
result = tre_ast_new_literal(ctx->mem, ASSERTION,
|
||||
@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx)
|
||||
goto parse_literal;
|
||||
break;
|
||||
|
||||
case CHAR_RPAREN:
|
||||
if (!depth)
|
||||
goto parse_literal;
|
||||
case CHAR_STAR:
|
||||
case CHAR_PIPE:
|
||||
case CHAR_LBRACE:
|
||||
case CHAR_PLUS:
|
||||
case CHAR_QUESTIONMARK:
|
||||
if (!(ctx->cflags & REG_EXTENDED))
|
||||
goto parse_literal;
|
||||
|
||||
empty_atom:
|
||||
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
|
||||
if (!result)
|
||||
return REG_ESPACE;
|
||||
break;
|
||||
|
||||
default:
|
||||
parse_literal:
|
||||
|
||||
/* We are expecting an atom. If the subexpression (or the whole
|
||||
regexp ends here, we interpret it as an empty expression
|
||||
(which matches an empty string). */
|
||||
if (
|
||||
(!*ctx->re
|
||||
|| *ctx->re == CHAR_STAR
|
||||
|| (ctx->cflags & REG_EXTENDED
|
||||
&& (*ctx->re == CHAR_PIPE
|
||||
|| *ctx->re == CHAR_LBRACE
|
||||
|| *ctx->re == CHAR_PLUS
|
||||
|| *ctx->re == CHAR_QUESTIONMARK))
|
||||
/* Test for "\)" in BRE mode. */
|
||||
|| (!(ctx->cflags & REG_EXTENDED)
|
||||
&& !*(ctx->re + 1)
|
||||
&& *ctx->re == CHAR_BACKSLASH
|
||||
&& *(ctx->re + 1) == CHAR_LBRACE)))
|
||||
{
|
||||
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
|
||||
if (!result)
|
||||
return REG_ESPACE;
|
||||
break;
|
||||
}
|
||||
|
||||
wchar_t wc;
|
||||
int clen = mbtowc(&wc, ctx->re, -1);
|
||||
clen = mbtowc(&wc, ctx->re, -1);
|
||||
if (clen<0) clen=1, wc=WEOF;
|
||||
|
||||
/* Note that we can't use an tre_isalpha() test here, since there
|
||||
|
Loading…
Reference in New Issue
Block a user