MAJOR: pattern: add LRU-based cache on pattern matching

The principle of this cache is to have a global cache for all pattern
matching operations which rely on lists (reg, sub, dir, dom, ...). The
input data, the expression and a random seed are used as a hashing key.
The cached entries contains a pointer to the expression and a revision
number for that expression so that we don't accidently used obsolete
data after a pattern update or a very unlikely hash collision.

Regarding the risk of collisions, 10k entries at 10k req/s mean 1% risk
of a collision after 60 years, that's already much less than the memory's
reliability in most machines and more durable than most admin's life
expectancy. A collision will result in a valid result to be returned
for a different entry from the same list. If this is not acceptable,
the cache can be disabled using tune.pattern.cache-size.

A test on a file containing 10k small regex showed that the regex
matching was limited to 6k/s instead of 70k with regular strings.
When enabling the LRU cache, the performance was back to 70k/s.
This commit is contained in:
Willy Tarreau 2015-04-29 16:24:50 +02:00
parent 72f073b6c7
commit f3045d2a06
6 changed files with 176 additions and 20 deletions

View File

@ -493,6 +493,7 @@ The following keywords are supported in the "global" section :
- tune.maxaccept
- tune.maxpollevents
- tune.maxrewrite
- tune.pattern.cache-size
- tune.pipesize
- tune.rcvbuf.client
- tune.rcvbuf.server
@ -1050,6 +1051,25 @@ tune.maxrewrite <number>
larger than that. This means you don't have to worry about it when changing
bufsize.
tune.pattern.cache-size <number>
Sets the size of the pattern lookup cache to <number> entries. This is an LRU
cache which reminds previous lookups and their results. It is used by ACLs
and maps on slow pattern lookups, namely the ones using the "sub", "reg",
"dir", "dom", "end", "bin" match methods as well as the case-insensitive
strings. It applies to pattern expressions which means that it will be able
to memorize the result of a lookup among all the patterns specified on a
configuration line (including all those loaded from files). It automatically
invalidates entries which are updated using HTTP actions or on the CLI. The
default cache size is set to 10000 entries, which limits its footprint to
about 5 MB on 32-bit systems and 8 MB on 64-bit systems. There is a very low
risk of collision in this cache, which is in the order of the size of the
cache divided by 2^64. Typically, at 10000 requests per second with the
default cache size of 10000 entries, there's 1% chance that a brute force
attack could cause a single collision after 60 years, or 0.1% after 6 years.
This is considered much lower than the risk of a memory corruption caused by
aging components. If this is not acceptable, the cache can be disabled by
setting this parameter to 0.
tune.pipesize <number>
Sets the kernel pipe buffer size to this size (in bytes). By default, pipes
are the default size for the system. But sometimes when using TCP splicing,

View File

@ -295,4 +295,15 @@
#ifndef TLS_TICKETS_NO
#define TLS_TICKETS_NO 3
#endif
/* pattern lookup default cache size, in number of entries :
* 10k entries at 10k req/s mean 1% risk of a collision after 60 years, that's
* already much less than the memory's reliability in most machines and more
* durable than most admin's life expectancy. A collision will result in a
* valid result to be returned for a different entry from the same list.
*/
#ifndef DEFAULT_PAT_LRU_SIZE
#define DEFAULT_PAT_LRU_SIZE 10000
#endif
#endif /* _COMMON_DEFAULTS_H */

View File

@ -142,6 +142,7 @@ struct global {
int pipesize; /* pipe size in bytes, system defaults if zero */
int max_http_hdr; /* max number of HTTP headers, use MAX_HTTP_HDR if zero */
int cookie_len; /* max length of cookie captures */
int pattern_cache; /* max number of entries in the pattern cache. */
int sslcachesize; /* SSL cache size in session, defaults to 20000 */
#ifdef USE_OPENSSL
int sslprivatecache; /* Force to use a private session cache even if nbproc > 1 */

View File

@ -907,6 +907,22 @@ int cfg_parse_global(const char *file, int linenum, char **args, int kwm)
goto out;
}
}
else if (!strcmp(args[0], "tune.pattern.cache-size")) {
if (*args[1]) {
global.tune.pattern_cache = atoi(args[1]);
if (global.tune.pattern_cache < 0) {
Alert("parsing [%s:%d] : '%s' expects a positive numeric value\n",
file, linenum, args[0]);
err_code |= ERR_ALERT | ERR_FATAL;
goto out;
}
} else {
Alert("parsing [%s:%d] : '%s' expects a positive numeric value\n",
file, linenum, args[0]);
err_code |= ERR_ALERT | ERR_FATAL;
goto out;
}
}
else if (!strcmp(args[0], "uid")) {
if (global.uid != 0) {
Alert("parsing [%s:%d] : user/uid already specified. Continuing.\n", file, linenum);

View File

@ -146,6 +146,7 @@ struct global global = {
.maxrewrite = MAXREWRITE,
.chksize = BUFSIZE,
.reserved_bufs = RESERVED_BUFS,
.pattern_cache = DEFAULT_PAT_LRU_SIZE,
#ifdef USE_OPENSSL
.sslcachesize = SSLCACHESIZE,
.ssl_default_dh_param = SSL_DEFAULT_DH_PARAM,

View File

@ -24,6 +24,8 @@
#include <proto/sample.h>
#include <ebsttree.h>
#include <import/lru.h>
#include <import/xxhash.h>
char *pat_match_names[PAT_MATCH_NUM] = {
[PAT_MATCH_FOUND] = "found",
@ -144,6 +146,9 @@ static struct pattern static_pattern;
/* This is the root of the list of all pattern_ref avalaibles. */
struct list pattern_reference = LIST_HEAD_INIT(pattern_reference);
static struct lru64_head *pat_lru_tree;
static unsigned long long pat_lru_seed;
/*
*
* The following functions are not exported and are used by internals process
@ -443,6 +448,8 @@ struct pattern *pat_match_str(struct sample *smp, struct pattern_expr *expr, int
struct pattern_tree *elt;
struct pattern_list *lst;
struct pattern *pattern;
struct pattern *ret = NULL;
struct lru64 *lru = NULL;
/* Lookup a string in the expression's pattern tree. */
if (!eb_is_empty(&expr->pattern_tree)) {
@ -468,6 +475,15 @@ struct pattern *pat_match_str(struct sample *smp, struct pattern_expr *expr, int
}
/* look in the list */
if (pat_lru_tree) {
unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
pat_lru_tree, expr, expr->revision);
if (lru && lru->domain)
return lru->data;
}
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@ -476,11 +492,16 @@ struct pattern *pat_match_str(struct sample *smp, struct pattern_expr *expr, int
icase = expr->mflags & PAT_MF_IGNORE_CASE;
if ((icase && strncasecmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0) ||
(!icase && strncmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0))
return pattern;
(!icase && strncmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0)) {
ret = pattern;
break;
}
}
return NULL;
if (lru)
lru64_commit(lru, ret, expr, expr->revision);
return ret;
}
/* NB: For two binaries buf to be identical, it is required that their lengths match */
@ -488,19 +509,34 @@ struct pattern *pat_match_bin(struct sample *smp, struct pattern_expr *expr, int
{
struct pattern_list *lst;
struct pattern *pattern;
struct pattern *ret = NULL;
struct lru64 *lru = NULL;
if (pat_lru_tree) {
unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
pat_lru_tree, expr, expr->revision);
if (lru && lru->domain)
return lru->data;
}
/* Look in the list. */
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
if (pattern->len != smp->data.str.len)
continue;
if (memcmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0)
return pattern;
if (memcmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0) {
ret = pattern;
break;
}
}
return NULL;
if (lru)
lru64_commit(lru, ret, expr, expr->revision);
return ret;
}
/* Executes a regex. It temporarily changes the data to add a trailing zero,
@ -510,15 +546,31 @@ struct pattern *pat_match_reg(struct sample *smp, struct pattern_expr *expr, int
{
struct pattern_list *lst;
struct pattern *pattern;
struct pattern *ret = NULL;
struct lru64 *lru = NULL;
if (pat_lru_tree) {
unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
pat_lru_tree, expr, expr->revision);
if (lru && lru->domain)
return lru->data;
}
/* look in the list */
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
if (regex_exec2(pattern->ptr.reg, smp->data.str.str, smp->data.str.len))
return pattern;
if (regex_exec2(pattern->ptr.reg, smp->data.str.str, smp->data.str.len)) {
ret = pattern;
break;
}
}
return NULL;
if (lru)
lru64_commit(lru, ret, expr, expr->revision);
return ret;
}
/* Checks that the pattern matches the beginning of the tested string. */
@ -530,6 +582,8 @@ struct pattern *pat_match_beg(struct sample *smp, struct pattern_expr *expr, int
struct pattern_tree *elt;
struct pattern_list *lst;
struct pattern *pattern;
struct pattern *ret = NULL;
struct lru64 *lru = NULL;
/* Lookup a string in the expression's pattern tree. */
if (!eb_is_empty(&expr->pattern_tree)) {
@ -555,6 +609,15 @@ struct pattern *pat_match_beg(struct sample *smp, struct pattern_expr *expr, int
}
/* look in the list */
if (pat_lru_tree) {
unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
pat_lru_tree, expr, expr->revision);
if (lru && lru->domain)
return lru->data;
}
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@ -566,9 +629,14 @@ struct pattern *pat_match_beg(struct sample *smp, struct pattern_expr *expr, int
(!icase && strncmp(pattern->ptr.str, smp->data.str.str, pattern->len) != 0))
continue;
return pattern;
ret = pattern;
break;
}
return NULL;
if (lru)
lru64_commit(lru, ret, expr, expr->revision);
return ret;
}
/* Checks that the pattern matches the end of the tested string. */
@ -577,6 +645,17 @@ struct pattern *pat_match_end(struct sample *smp, struct pattern_expr *expr, int
int icase;
struct pattern_list *lst;
struct pattern *pattern;
struct pattern *ret = NULL;
struct lru64 *lru = NULL;
if (pat_lru_tree) {
unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
pat_lru_tree, expr, expr->revision);
if (lru && lru->domain)
return lru->data;
}
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@ -589,9 +668,14 @@ struct pattern *pat_match_end(struct sample *smp, struct pattern_expr *expr, int
(!icase && strncmp(pattern->ptr.str, smp->data.str.str + smp->data.str.len - pattern->len, pattern->len) != 0))
continue;
return pattern;
ret = pattern;
break;
}
return NULL;
if (lru)
lru64_commit(lru, ret, expr, expr->revision);
return ret;
}
/* Checks that the pattern is included inside the tested string.
@ -604,6 +688,17 @@ struct pattern *pat_match_sub(struct sample *smp, struct pattern_expr *expr, int
char *c;
struct pattern_list *lst;
struct pattern *pattern;
struct pattern *ret = NULL;
struct lru64 *lru = NULL;
if (pat_lru_tree) {
unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
pat_lru_tree, expr, expr->revision);
if (lru && lru->domain)
return lru->data;
}
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@ -617,19 +712,27 @@ struct pattern *pat_match_sub(struct sample *smp, struct pattern_expr *expr, int
for (c = smp->data.str.str; c <= end; c++) {
if (tolower(*c) != tolower(*pattern->ptr.str))
continue;
if (strncasecmp(pattern->ptr.str, c, pattern->len) == 0)
return pattern;
if (strncasecmp(pattern->ptr.str, c, pattern->len) == 0) {
ret = pattern;
goto leave;
}
}
} else {
for (c = smp->data.str.str; c <= end; c++) {
if (*c != *pattern->ptr.str)
continue;
if (strncmp(pattern->ptr.str, c, pattern->len) == 0)
return pattern;
if (strncmp(pattern->ptr.str, c, pattern->len) == 0) {
ret = pattern;
goto leave;
}
}
}
}
return NULL;
leave:
if (lru)
lru64_commit(lru, ret, expr, expr->revision);
return ret;
}
/* This one is used by other real functions. It checks that the pattern is
@ -2321,6 +2424,10 @@ void pattern_finalize_config(void)
struct pat_ref *ref, *ref2, *ref3;
struct list pr = LIST_HEAD_INIT(pr);
pat_lru_seed = random();
if (global.tune.pattern_cache)
pat_lru_tree = lru64_new(global.tune.pattern_cache);
list_for_each_entry(ref, &pattern_reference, list) {
if (ref->unique_id == -1) {
/* Look for the first free id. */