mirror of
http://git.haproxy.org/git/haproxy.git/
synced 2025-01-19 04:00:46 +00:00
MEDIUM: streams: measure processing time and abort when detecting bugs
On some occasions we've had loops happening when processing actions (e.g. a yield not being well understood) resulting in analysers being called in loops until the analysis timeout without incrementing the stream's call count, thus this type of bug cannot be caught by the current protection system. What this patch proposes is to start to measure the time spent in analysers when profiling is enabled on the thread, in order to detect if a stream is really misbehaving. In this case we measured the consumed CPU time, not the wall clock time, so as not to be affected by possible noisy neighbours sharing the same CPU. When more than 100ms are spent in an analyser, we trigger the stream_dump_and_crash() function to report the anomaly. The choice of 100ms comes from the fact that regular calls only take around 1 microsecond and it seems reasonable to accept a degradation factor of 100000, which covers very slow machines such as home gateways running on sub-ghz processors, with extremely heavy configurations. Some complete tests show that even this common bogus map_regm() entry supposedly designed to extract a port from an IP:port entry does not trigger the timeout (25 ms evaluation time for a 4kB header, exercise left to the reader to spot the mistake) : ([0-9]{0,3}).([0-9]{0,3}).([0-9]{0,3}).([0-9]{0,3}):([0-9]{0,5}) \5 However this one purposely designed to kill haproxy definitely dies as it manages to completely freeze the whole process for more than one second on a 4 GHz CPU for only 120 bytes in : (.{0,20})(.{0,20})(.{0,20})(.{0,20})(.{0,20})b \1 This protection will definitely help during the code stabilization period and may possibly be left enabled later depending on reported issues or not. If you've noticed that your workload is affected by this patch, please report it as you have very likely found a bug. And in the mean time you can turn profiling off to disable it.
This commit is contained in:
parent
3d07a16f14
commit
4ad574fbe2
32
src/stream.c
32
src/stream.c
@ -1712,6 +1712,7 @@ struct task *process_stream(struct task *t, void *context, unsigned short state)
|
||||
struct channel *req, *res;
|
||||
struct stream_interface *si_f, *si_b;
|
||||
unsigned int rate;
|
||||
uint64_t cpu_time_before, cpu_time_after;
|
||||
|
||||
activity[tid].stream++;
|
||||
|
||||
@ -1725,6 +1726,11 @@ struct task *process_stream(struct task *t, void *context, unsigned short state)
|
||||
si_sync_recv(si_f);
|
||||
si_sync_recv(si_b);
|
||||
|
||||
/* measure only the analysers' time */
|
||||
cpu_time_before = 0;
|
||||
if (task_profiling_mask & tid_bit)
|
||||
cpu_time_before = now_cpu_time();
|
||||
|
||||
redo:
|
||||
rate = update_freq_ctr(&s->call_rate, 1);
|
||||
if (rate >= 100000 && s->call_rate.prev_ctr) { // make sure to wait at least a full second
|
||||
@ -2602,6 +2608,19 @@ redo:
|
||||
s->pending_events &= ~(TASK_WOKEN_TIMER | TASK_WOKEN_RES);
|
||||
stream_release_buffers(s);
|
||||
/* We may have free'd some space in buffers, or have more to send/recv, try again */
|
||||
|
||||
if (cpu_time_before) {
|
||||
cpu_time_after = now_cpu_time();
|
||||
cpu_time_after -= cpu_time_before;
|
||||
if (cpu_time_after >= 100000000U) {
|
||||
/* more than 1 ms CPU time spent in analysers is already absolutely
|
||||
* insane, but on tiny machines where performance doesn't matter it
|
||||
* may happen when using tons of regex, so let's fix the threshold
|
||||
* to 100ms which must never ever be hit.
|
||||
*/
|
||||
stream_dump_and_crash(&s->obj_type, -(cpu_time_after / 1000));
|
||||
}
|
||||
}
|
||||
return t; /* nothing more to do */
|
||||
}
|
||||
|
||||
@ -2649,6 +2668,19 @@ redo:
|
||||
/* update time stats for this stream */
|
||||
stream_update_time_stats(s);
|
||||
|
||||
if (cpu_time_before) {
|
||||
cpu_time_after = now_cpu_time();
|
||||
cpu_time_after -= cpu_time_before;
|
||||
if (cpu_time_after >= 100000000U) {
|
||||
/* more than 1 ms CPU time spent in analysers is already absolutely
|
||||
* insane, but on tiny machines where performance doesn't matter it
|
||||
* may happen when using tons of regex, so let's fix the threshold
|
||||
* to 100ms which must never ever be hit.
|
||||
*/
|
||||
stream_dump_and_crash(&s->obj_type, -(cpu_time_after / 1000));
|
||||
}
|
||||
}
|
||||
|
||||
/* the task MUST not be in the run queue anymore */
|
||||
stream_free(s);
|
||||
task_destroy(t);
|
||||
|
Loading…
Reference in New Issue
Block a user