From 58e36e5b144da7a810a84a041a68b1ebf5ec516b Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 6 Apr 2023 22:51:56 +0200 Subject: [PATCH] MEDIUM: hlua: introduce tune.lua.burst-timeout The "burst" execution timeout applies to any Lua handler. If the handler fails to finish or yield before timeout is reached, handler will be aborted to prevent thread contention, to prevent traffic from not being served for too long, and ultimately to prevent the process from crashing because of the watchdog kicking in. Default value is 1000ms. Combined with forced-yield default value of 10000 lua instructions, it should be high enough to prevent any existing script breakage, while still being able to catch slow lua converters or sample fetches doing thread contention and risking the process stability. Setting value to 0 completely bypasses this check. (not recommended but could be required to restore original behavior if this feature breaks existing setups somehow...) No backport needed, although it could be used to prevent watchdog crashes due to poorly coded (slow/cpu consuming) lua sample fetches/converters. --- doc/configuration.txt | 50 +++++++++++++++++++++++++++++++++++++++++++ doc/lua.txt | 3 ++- src/hlua.c | 18 +++++++++++++++- 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index d098695e3..82721f43a 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -3055,6 +3055,56 @@ tune.lua.session-timeout counts only the pure Lua runtime. If the Lua does a sleep, the sleep is not taken in account. The default timeout is 4s. +tune.lua.burst-timeout + The "burst" execution timeout applies to any Lua handler. If the handler + fails to finish or yield before timeout is reached, it will be aborted to + prevent thread contention, to prevent traffic from not being served for too + long, and ultimately to prevent the process from crashing because of the + watchdog kicking in. Unlike other lua timeouts which are yield-cumulative, + burst-timeout will ensure that the time spent in a single lua execution + window does not exceed the configured timeout. + + Yielding here means that the lua execution is effectively interrupted + either through an explicit call to lua-yielding function such as + core.(m)sleep() or core.yield(), or following an automatic forced-yield + (see tune.lua.forced-yield) and that it will be resumed later when the + related task is set for rescheduling. Not all lua handlers may yield: we have + to make a distinction between yieldable handlers and unyieldable handlers. + + For yieldable handlers (tasks, actions..), reaching the timeout means + "tune.lua.forced-yield" might be too high for the system, reducing it + could improve the situation, but it could also be a good idea to check if + adding manual yields at some key points within the lua function helps or not. + It may also indicate that the handler is spending too much time in a specific + lua library function that cannot be interrupted. + + For unyieldable handlers (lua converters, sample fetches), it could simply + indicate that the handler is doing too much computation, which could result + from an improper design given that such handlers, which often block the + request execution flow, are expected to terminate quickly to allow the + request processing to go through. A common resolution approach here would be + to try to better optimize the lua function for speed since decreasing + "tune.lua.forced-yield" won't help. + + This timeout only counts the pure Lua runtime. If the Lua does a core.sleep, + the sleeping time is not taken in account. The default timeout is 1000ms. + + Note: if a lua GC cycle is initiated from the handler (either explicitly + requested or automatically triggered by lua after some time), the GC cycle + time will also be accounted for. + + Indeed, there is no way to deduce the GC cycle time, so this could lead to + some false positives on saturated systems (where GC is having hard time to + catch up and consumes most of the available execution runtime). If it were + to be the case, here are some resolution leads: + + - checking if the script could be optimized to reduce lua memory footprint + - fine-tuning lua GC parameters and / or requesting manual GC cycles + (see: https://www.lua.org/manual/5.4/manual.html#pdf-collectgarbage) + - increasing tune.lua.burst-timeout + + Setting value to 0 completely disables this protection. + tune.lua.service-timeout This is the execution timeout for the Lua services. This is useful for preventing infinite loops or spending too much time in Lua. This timeout diff --git a/doc/lua.txt b/doc/lua.txt index 3ef0d3168..d6fbc2f0b 100644 --- a/doc/lua.txt +++ b/doc/lua.txt @@ -253,9 +253,10 @@ not the difference between the end time and the start time. The groups are: The corresponding tune options are: - - tune.lua.session-timeout (fetches, converters and action) + - tune.lua.session-timeout (action, filter, cli) - tune.lua.task-timeout (task) - tune.lua.service-timeout (services) + - tune.lua.burst-timeout (max time between two lua yields) The task does not have a timeout because it runs in background along the HAProxy process life. diff --git a/src/hlua.c b/src/hlua.c index 492ee7e76..bea0cf754 100644 --- a/src/hlua.c +++ b/src/hlua.c @@ -354,6 +354,7 @@ static int class_txn_reply_ref; * with hlua timer's API exclusively. * 0 means no timeout */ +static uint32_t hlua_timeout_burst = 1000; /* burst timeout. */ static uint32_t hlua_timeout_session = 4000; /* session timeout. */ static uint32_t hlua_timeout_task = 0; /* task timeout. */ static uint32_t hlua_timeout_applet = 4000; /* applet timeout. */ @@ -423,7 +424,11 @@ static inline void hlua_timer_stop(struct hlua_timer *timer) timer->burst += _hlua_time_burst(timer); } -/* check the timers for current hlua context +/* check the timers for current hlua context: + * - first check for burst timeout (max execution time for the current + hlua resume, ie: time between effective yields) + * - then check for yield cumulative timeout + * * Returns 1 if the check succeeded and 0 if it failed * (ie: timeout exceeded) */ @@ -431,6 +436,8 @@ static inline int hlua_timer_check(const struct hlua_timer *timer) { uint32_t pburst = _hlua_time_burst(timer); /* pending burst time in ms */ + if (hlua_timeout_burst && (timer->burst + pburst) > hlua_timeout_burst) + return 0; /* burst timeout exceeded */ if (timer->max && (timer->cumulative + timer->burst + pburst) > timer->max) return 0; /* cumulative timeout exceeded */ return 1; /* ok */ @@ -11967,6 +11974,14 @@ static int hlua_read_timeout(char **args, int section_type, struct proxy *curpx, return 0; } +static int hlua_burst_timeout(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return hlua_read_timeout(args, section_type, curpx, defpx, + file, line, err, &hlua_timeout_burst); +} + static int hlua_session_timeout(char **args, int section_type, struct proxy *curpx, const struct proxy *defpx, const char *file, int line, char **err) @@ -12269,6 +12284,7 @@ static struct cfg_kw_list cfg_kws = {{ },{ { CFG_GLOBAL, "tune.lua.session-timeout", hlua_session_timeout }, { CFG_GLOBAL, "tune.lua.task-timeout", hlua_task_timeout }, { CFG_GLOBAL, "tune.lua.service-timeout", hlua_applet_timeout }, + { CFG_GLOBAL, "tune.lua.burst-timeout", hlua_burst_timeout }, { CFG_GLOBAL, "tune.lua.forced-yield", hlua_forced_yield }, { CFG_GLOBAL, "tune.lua.maxmem", hlua_parse_maxmem }, { 0, NULL, NULL },