2006-06-26 00:48:02 +00:00
|
|
|
/*
|
2010-01-28 17:10:50 +00:00
|
|
|
* include/common/regex.h
|
|
|
|
* This file defines everything related to regular expressions.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2000-2010 Willy Tarreau - w@1wt.eu
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation, version 2.1
|
|
|
|
* exclusively.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this library; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
2006-06-26 00:48:02 +00:00
|
|
|
|
2006-06-29 15:53:05 +00:00
|
|
|
#ifndef _COMMON_REGEX_H
|
|
|
|
#define _COMMON_REGEX_H
|
2006-06-26 00:48:02 +00:00
|
|
|
|
2013-10-09 13:23:01 +00:00
|
|
|
#include <stdlib.h>
|
2015-11-26 18:33:54 +00:00
|
|
|
#include <string.h>
|
2013-10-09 13:23:01 +00:00
|
|
|
|
2020-05-27 10:58:42 +00:00
|
|
|
#include <haproxy/api.h>
|
2020-05-09 07:08:09 +00:00
|
|
|
#include <common/hathreads.h>
|
2006-06-26 00:48:02 +00:00
|
|
|
|
|
|
|
#ifdef USE_PCRE
|
|
|
|
#include <pcre.h>
|
|
|
|
#include <pcreposix.h>
|
MEDIUM: regex: Use pcre_study always when PCRE is used, regardless of JIT
pcre_study() has been around long before JIT has been added. It also seems to
affect the performance in some cases (positive).
Below I've attached some test results. The test is based on
http://sljit.sourceforge.net/regex_perf.html (see bottom). It has been modified
to just test pcre_study vs. no pcre_study. Note: This test does not try to
match specific header it's instead run over a larger text with more and less
complex patterns to make the differences more clear.
% ./runtest
'mark.txt' loaded. (Length: 19665221 bytes)
-----------------
Regex: 'Twain'
[pcre-nostudy] time: 14 ms (2388 matches)
[pcre-study] time: 21 ms (2388 matches)
-----------------
Regex: '^Twain'
[pcre-nostudy] time: 109 ms (100 matches)
[pcre-study] time: 109 ms (100 matches)
-----------------
Regex: 'Twain$'
[pcre-nostudy] time: 14 ms (127 matches)
[pcre-study] time: 16 ms (127 matches)
-----------------
Regex: 'Huck[a-zA-Z]+|Finn[a-zA-Z]+'
[pcre-nostudy] time: 695 ms (83 matches)
[pcre-study] time: 26 ms (83 matches)
-----------------
Regex: 'a[^x]{20}b'
[pcre-nostudy] time: 90 ms (12495 matches)
[pcre-study] time: 91 ms (12495 matches)
-----------------
Regex: 'Tom|Sawyer|Huckleberry|Finn'
[pcre-nostudy] time: 1236 ms (3015 matches)
[pcre-study] time: 34 ms (3015 matches)
-----------------
Regex: '.{0,3}(Tom|Sawyer|Huckleberry|Finn)'
[pcre-nostudy] time: 5696 ms (3015 matches)
[pcre-study] time: 5655 ms (3015 matches)
-----------------
Regex: '[a-zA-Z]+ing'
[pcre-nostudy] time: 1290 ms (95863 matches)
[pcre-study] time: 1167 ms (95863 matches)
-----------------
Regex: '^[a-zA-Z]{0,4}ing[^a-zA-Z]'
[pcre-nostudy] time: 136 ms (4507 matches)
[pcre-study] time: 134 ms (4507 matches)
-----------------
Regex: '[a-zA-Z]+ing$'
[pcre-nostudy] time: 1334 ms (5360 matches)
[pcre-study] time: 1214 ms (5360 matches)
-----------------
Regex: '^[a-zA-Z ]{5,}$'
[pcre-nostudy] time: 198 ms (26236 matches)
[pcre-study] time: 197 ms (26236 matches)
-----------------
Regex: '^.{16,20}$'
[pcre-nostudy] time: 173 ms (4902 matches)
[pcre-study] time: 175 ms (4902 matches)
-----------------
Regex: '([a-f](.[d-m].){0,2}[h-n]){2}'
[pcre-nostudy] time: 1242 ms (68621 matches)
[pcre-study] time: 690 ms (68621 matches)
-----------------
Regex: '([A-Za-z]awyer|[A-Za-z]inn)[^a-zA-Z]'
[pcre-nostudy] time: 1215 ms (675 matches)
[pcre-study] time: 952 ms (675 matches)
-----------------
Regex: '"[^"]{0,30}[?!\.]"'
[pcre-nostudy] time: 27 ms (5972 matches)
[pcre-study] time: 28 ms (5972 matches)
-----------------
Regex: 'Tom.{10,25}river|river.{10,25}Tom'
[pcre-nostudy] time: 705 ms (2 matches)
[pcre-study] time: 68 ms (2 matches)
In some cases it's more or less the same, but when it's faster, it is by a huge margin.
It always depends on the pattern, the string(s) to match against etc.
Signed-off-by: Christian Ruppert <c.ruppert@babiel.com>
2014-11-18 12:03:58 +00:00
|
|
|
|
|
|
|
/* For pre-8.20 PCRE compatibility */
|
|
|
|
#ifndef PCRE_STUDY_JIT_COMPILE
|
|
|
|
#define PCRE_STUDY_JIT_COMPILE 0
|
|
|
|
#endif
|
|
|
|
|
2016-11-21 21:25:58 +00:00
|
|
|
#elif USE_PCRE2
|
|
|
|
#include <pcre2.h>
|
|
|
|
#include <pcre2posix.h>
|
|
|
|
|
|
|
|
#else /* no PCRE, nor PCRE2 */
|
2013-12-06 19:36:20 +00:00
|
|
|
#include <regex.h>
|
|
|
|
#endif
|
2013-01-13 06:00:42 +00:00
|
|
|
|
2013-12-06 19:36:20 +00:00
|
|
|
/* Compiled regular expression. The members depend on the regex backend
 * selected at build time: PCRE (USE_PCRE), PCRE2 (USE_PCRE2), or the POSIX
 * <regex.h> implementation when neither is defined.
 */
struct my_regex {
#ifdef USE_PCRE
	pcre *reg;           /* compiled pattern, first argument of pcre_exec() */
	pcre_extra *extra;   /* extra data handed to pcre_exec() along with <reg> */
#ifdef USE_PCRE_JIT
#ifndef PCRE_CONFIG_JIT
#error "The PCRE lib doesn't support JIT. Change your lib, or remove the option USE_PCRE_JIT."
#endif
#endif
#elif defined(USE_PCRE2) /* defined() matches the tests used by the functions below */
	pcre2_code *reg;     /* compiled pattern, first argument of pcre2_match() */
#else /* no PCRE */
	regex_t regex;       /* POSIX compiled pattern, used with regexec()/regfree() */
#endif
};
|
2006-06-26 00:48:02 +00:00
|
|
|
|
|
|
|
/* Node of a singly linked list that pairs a compiled regex with its
 * replacement string and an optional condition.
 */
struct hdr_exp {
	struct hdr_exp *next;           /* next node of the list */
	struct my_regex *preg;          /* expression to look for */
	const char *replace;            /* expression to set instead */
	void *cond;                     /* a possible condition or NULL */
};
|
|
|
|
|
2017-06-15 09:53:49 +00:00
|
|
|
/* Array of MAX_MATCH match slots, declared with the project's THREAD_LOCAL
 * storage qualifier. This is only a declaration; the definition is not in
 * this header. NOTE(review): THREAD_LOCAL and MAX_MATCH are expected to come
 * from the headers included above - confirm.
 */
extern THREAD_LOCAL regmatch_t pmatch[MAX_MATCH];
|
2006-06-26 00:48:02 +00:00
|
|
|
|
2013-10-14 12:07:36 +00:00
|
|
|
/* "str" is the string that contains the regex to compile.
 * "cs" is the case sensitive flag. If cs is true, case sensitivity is enabled.
 * "cap" is the capture flag. If cap is true, the regex can capture
 *       parenthesized substrings.
 * "err" is the standard error message pointer.
 *
 * The function returns a pointer to the compiled regex, which regex_free()
 * below releases. NOTE(review): the failure path is not visible from this
 * header; presumably NULL is returned and err is filled - confirm against the
 * implementation.
 */
|
2019-04-30 13:54:36 +00:00
|
|
|
struct my_regex *regex_comp(const char *str, int cs, int cap, char **err);
|
2014-05-28 21:05:07 +00:00
|
|
|
int exp_replace(char *dst, unsigned int dst_size, char *src, const char *str, const regmatch_t *matches);
|
2006-10-15 13:17:57 +00:00
|
|
|
const char *check_replace_string(const char *str);
|
2006-06-26 00:48:02 +00:00
|
|
|
|
2014-06-11 11:59:05 +00:00
|
|
|
/* If the function doesn't match, it returns false, else it returns true.
|
|
|
|
*/
|
|
|
|
static inline int regex_exec(const struct my_regex *preg, char *subject) {
|
2014-06-18 09:50:51 +00:00
|
|
|
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
|
2014-06-11 11:59:05 +00:00
|
|
|
if (pcre_exec(preg->reg, preg->extra, subject, strlen(subject), 0, 0, NULL, 0) < 0)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2016-11-21 21:25:58 +00:00
|
|
|
#elif defined(USE_PCRE2)
|
|
|
|
pcre2_match_data *pm;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
pm = pcre2_match_data_create_from_pattern(preg->reg, NULL);
|
|
|
|
ret = pcre2_match(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)strlen(subject),
|
|
|
|
0, 0, pm, NULL);
|
|
|
|
pcre2_match_data_free(pm);
|
|
|
|
if (ret < 0)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2014-06-11 11:59:05 +00:00
|
|
|
#else
|
|
|
|
int match;
|
|
|
|
match = regexec(&preg->regex, subject, 0, NULL, 0);
|
|
|
|
if (match == REG_NOMATCH)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2013-10-15 11:41:44 +00:00
|
|
|
/* Note that <subject> MUST be at least <length+1> characters long and must
|
|
|
|
* be writable because the function will temporarily force a zero past the
|
|
|
|
* last character.
|
2014-06-11 11:59:05 +00:00
|
|
|
*
|
|
|
|
* If the function doesn't match, it returns false, else it returns true.
|
2013-10-15 11:41:44 +00:00
|
|
|
*/
|
2014-06-11 11:59:05 +00:00
|
|
|
static inline int regex_exec2(const struct my_regex *preg, char *subject, int length) {
|
2014-06-18 09:50:51 +00:00
|
|
|
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
|
2014-06-11 11:59:05 +00:00
|
|
|
if (pcre_exec(preg->reg, preg->extra, subject, length, 0, 0, NULL, 0) < 0)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2016-11-21 21:25:58 +00:00
|
|
|
#elif defined(USE_PCRE2)
|
|
|
|
pcre2_match_data *pm;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
pm = pcre2_match_data_create_from_pattern(preg->reg, NULL);
|
|
|
|
ret = pcre2_match(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)length,
|
|
|
|
0, 0, pm, NULL);
|
|
|
|
pcre2_match_data_free(pm);
|
|
|
|
if (ret < 0)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2013-01-13 06:00:42 +00:00
|
|
|
#else
|
2013-10-15 11:41:44 +00:00
|
|
|
int match;
|
|
|
|
char old_char = subject[length];
|
|
|
|
subject[length] = 0;
|
2013-12-06 19:36:20 +00:00
|
|
|
match = regexec(&preg->regex, subject, 0, NULL, 0);
|
2013-10-15 11:41:44 +00:00
|
|
|
subject[length] = old_char;
|
2014-06-11 11:59:05 +00:00
|
|
|
if (match == REG_NOMATCH)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2013-01-13 06:00:42 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2014-06-11 11:59:05 +00:00
|
|
|
int regex_exec_match(const struct my_regex *preg, const char *subject,
|
2015-01-21 12:39:42 +00:00
|
|
|
size_t nmatch, regmatch_t pmatch[], int flags);
|
2014-06-11 11:59:05 +00:00
|
|
|
int regex_exec_match2(const struct my_regex *preg, char *subject, int length,
|
2015-01-21 12:39:42 +00:00
|
|
|
size_t nmatch, regmatch_t pmatch[], int flags);
|
2014-06-11 11:59:05 +00:00
|
|
|
|
2013-12-06 19:36:20 +00:00
|
|
|
static inline void regex_free(struct my_regex *preg) {
|
2019-04-30 13:54:36 +00:00
|
|
|
if (!preg)
|
|
|
|
return;
|
2014-06-18 09:50:51 +00:00
|
|
|
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
|
|
|
|
pcre_free(preg->reg);
|
MEDIUM: regex: Use pcre_study always when PCRE is used, regardless of JIT
pcre_study() has been around long before JIT has been added. It also seems to
affect the performance in some cases (positive).
Below I've attached some test restults. The test is based on
http://sljit.sourceforge.net/regex_perf.html (see bottom). It has been modified
to just test pcre_study vs. no pcre_study. Note: This test does not try to
match specific header it's instead run over a larger text with more and less
complex patterns to make the differences more clear.
% ./runtest
'mark.txt' loaded. (Length: 19665221 bytes)
-----------------
Regex: 'Twain'
[pcre-nostudy] time: 14 ms (2388 matches)
[pcre-study] time: 21 ms (2388 matches)
-----------------
Regex: '^Twain'
[pcre-nostudy] time: 109 ms (100 matches)
[pcre-study] time: 109 ms (100 matches)
-----------------
Regex: 'Twain$'
[pcre-nostudy] time: 14 ms (127 matches)
[pcre-study] time: 16 ms (127 matches)
-----------------
Regex: 'Huck[a-zA-Z]+|Finn[a-zA-Z]+'
[pcre-nostudy] time: 695 ms (83 matches)
[pcre-study] time: 26 ms (83 matches)
-----------------
Regex: 'a[^x]{20}b'
[pcre-nostudy] time: 90 ms (12495 matches)
[pcre-study] time: 91 ms (12495 matches)
-----------------
Regex: 'Tom|Sawyer|Huckleberry|Finn'
[pcre-nostudy] time: 1236 ms (3015 matches)
[pcre-study] time: 34 ms (3015 matches)
-----------------
Regex: '.{0,3}(Tom|Sawyer|Huckleberry|Finn)'
[pcre-nostudy] time: 5696 ms (3015 matches)
[pcre-study] time: 5655 ms (3015 matches)
-----------------
Regex: '[a-zA-Z]+ing'
[pcre-nostudy] time: 1290 ms (95863 matches)
[pcre-study] time: 1167 ms (95863 matches)
-----------------
Regex: '^[a-zA-Z]{0,4}ing[^a-zA-Z]'
[pcre-nostudy] time: 136 ms (4507 matches)
[pcre-study] time: 134 ms (4507 matches)
-----------------
Regex: '[a-zA-Z]+ing$'
[pcre-nostudy] time: 1334 ms (5360 matches)
[pcre-study] time: 1214 ms (5360 matches)
-----------------
Regex: '^[a-zA-Z ]{5,}$'
[pcre-nostudy] time: 198 ms (26236 matches)
[pcre-study] time: 197 ms (26236 matches)
-----------------
Regex: '^.{16,20}$'
[pcre-nostudy] time: 173 ms (4902 matches)
[pcre-study] time: 175 ms (4902 matches)
-----------------
Regex: '([a-f](.[d-m].){0,2}[h-n]){2}'
[pcre-nostudy] time: 1242 ms (68621 matches)
[pcre-study] time: 690 ms (68621 matches)
-----------------
Regex: '([A-Za-z]awyer|[A-Za-z]inn)[^a-zA-Z]'
[pcre-nostudy] time: 1215 ms (675 matches)
[pcre-study] time: 952 ms (675 matches)
-----------------
Regex: '"[^"]{0,30}[?!\.]"'
[pcre-nostudy] time: 27 ms (5972 matches)
[pcre-study] time: 28 ms (5972 matches)
-----------------
Regex: 'Tom.{10,25}river|river.{10,25}Tom'
[pcre-nostudy] time: 705 ms (2 matches)
[pcre-study] time: 68 ms (2 matches)
In some cases it's more or less the same but when it's faster than by a huge margin.
It always depends on the pattern, the string(s) to match against etc.
Signed-off-by: Christian Ruppert <c.ruppert@babiel.com>
2014-11-18 12:03:58 +00:00
|
|
|
/* PCRE < 8.20 requires pcre_free() while >= 8.20 requires pcre_study_free(),
|
|
|
|
* which is easily detected using PCRE_CONFIG_JIT.
|
|
|
|
*/
|
|
|
|
#ifdef PCRE_CONFIG_JIT
|
2013-01-13 06:00:42 +00:00
|
|
|
pcre_free_study(preg->extra);
|
MEDIUM: regex: Use pcre_study always when PCRE is used, regardless of JIT
pcre_study() has been around long before JIT has been added. It also seems to
affect the performance in some cases (positive).
Below I've attached some test restults. The test is based on
http://sljit.sourceforge.net/regex_perf.html (see bottom). It has been modified
to just test pcre_study vs. no pcre_study. Note: This test does not try to
match specific header it's instead run over a larger text with more and less
complex patterns to make the differences more clear.
% ./runtest
'mark.txt' loaded. (Length: 19665221 bytes)
-----------------
Regex: 'Twain'
[pcre-nostudy] time: 14 ms (2388 matches)
[pcre-study] time: 21 ms (2388 matches)
-----------------
Regex: '^Twain'
[pcre-nostudy] time: 109 ms (100 matches)
[pcre-study] time: 109 ms (100 matches)
-----------------
Regex: 'Twain$'
[pcre-nostudy] time: 14 ms (127 matches)
[pcre-study] time: 16 ms (127 matches)
-----------------
Regex: 'Huck[a-zA-Z]+|Finn[a-zA-Z]+'
[pcre-nostudy] time: 695 ms (83 matches)
[pcre-study] time: 26 ms (83 matches)
-----------------
Regex: 'a[^x]{20}b'
[pcre-nostudy] time: 90 ms (12495 matches)
[pcre-study] time: 91 ms (12495 matches)
-----------------
Regex: 'Tom|Sawyer|Huckleberry|Finn'
[pcre-nostudy] time: 1236 ms (3015 matches)
[pcre-study] time: 34 ms (3015 matches)
-----------------
Regex: '.{0,3}(Tom|Sawyer|Huckleberry|Finn)'
[pcre-nostudy] time: 5696 ms (3015 matches)
[pcre-study] time: 5655 ms (3015 matches)
-----------------
Regex: '[a-zA-Z]+ing'
[pcre-nostudy] time: 1290 ms (95863 matches)
[pcre-study] time: 1167 ms (95863 matches)
-----------------
Regex: '^[a-zA-Z]{0,4}ing[^a-zA-Z]'
[pcre-nostudy] time: 136 ms (4507 matches)
[pcre-study] time: 134 ms (4507 matches)
-----------------
Regex: '[a-zA-Z]+ing$'
[pcre-nostudy] time: 1334 ms (5360 matches)
[pcre-study] time: 1214 ms (5360 matches)
-----------------
Regex: '^[a-zA-Z ]{5,}$'
[pcre-nostudy] time: 198 ms (26236 matches)
[pcre-study] time: 197 ms (26236 matches)
-----------------
Regex: '^.{16,20}$'
[pcre-nostudy] time: 173 ms (4902 matches)
[pcre-study] time: 175 ms (4902 matches)
-----------------
Regex: '([a-f](.[d-m].){0,2}[h-n]){2}'
[pcre-nostudy] time: 1242 ms (68621 matches)
[pcre-study] time: 690 ms (68621 matches)
-----------------
Regex: '([A-Za-z]awyer|[A-Za-z]inn)[^a-zA-Z]'
[pcre-nostudy] time: 1215 ms (675 matches)
[pcre-study] time: 952 ms (675 matches)
-----------------
Regex: '"[^"]{0,30}[?!\.]"'
[pcre-nostudy] time: 27 ms (5972 matches)
[pcre-study] time: 28 ms (5972 matches)
-----------------
Regex: 'Tom.{10,25}river|river.{10,25}Tom'
[pcre-nostudy] time: 705 ms (2 matches)
[pcre-study] time: 68 ms (2 matches)
In some cases it's more or less the same but when it's faster than by a huge margin.
It always depends on the pattern, the string(s) to match against etc.
Signed-off-by: Christian Ruppert <c.ruppert@babiel.com>
2014-11-18 12:03:58 +00:00
|
|
|
#else /* PCRE_CONFIG_JIT */
|
|
|
|
pcre_free(preg->extra);
|
|
|
|
#endif /* PCRE_CONFIG_JIT */
|
2016-11-21 21:25:58 +00:00
|
|
|
#elif defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
|
|
|
|
pcre2_code_free(preg->reg);
|
2013-01-13 06:00:42 +00:00
|
|
|
#else
|
2013-12-06 19:36:20 +00:00
|
|
|
regfree(&preg->regex);
|
2013-01-13 06:00:42 +00:00
|
|
|
#endif
|
2019-04-30 13:54:36 +00:00
|
|
|
free(preg);
|
2013-01-13 06:00:42 +00:00
|
|
|
}
|
|
|
|
|
2006-06-29 15:53:05 +00:00
|
|
|
#endif /* _COMMON_REGEX_H */
|
2006-06-26 00:48:02 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Local variables:
|
|
|
|
* c-indent-level: 8
|
|
|
|
* c-basic-offset: 8
|
|
|
|
* End:
|
|
|
|
*/
|