diff --git a/include/haproxy/clock.h b/include/haproxy/clock.h
index 045ab5a40..448b0c8d0 100644
--- a/include/haproxy/clock.h
+++ b/include/haproxy/clock.h
@@ -41,5 +41,8 @@ void clock_update_date(int max_wait, int interrupted);
 void clock_init_process_date(void);
 void clock_init_thread_date(void);
 char *timeofday_as_iso_us(int pad);
+uint clock_report_idle(void);
+void clock_leaving_poll(int timeout, int interrupted);
+void clock_entering_poll(void);
 
 #endif
diff --git a/include/haproxy/task.h b/include/haproxy/task.h
index 4acbcad96..c1261c799 100644
--- a/include/haproxy/task.h
+++ b/include/haproxy/task.h
@@ -93,9 +93,6 @@ extern volatile unsigned long global_tasks_mask; /* Mask of threads with tasks i
 extern unsigned int grq_total;    /* total number of entries in the global run queue, atomic */
 extern unsigned int niced_tasks;  /* number of niced tasks in the run queue */
 
-extern THREAD_LOCAL unsigned int samp_time; /* total elapsed time over current sample */
-extern THREAD_LOCAL unsigned int idle_time; /* total idle time over current sample */
-
 extern struct pool_head *pool_head_task;
 extern struct pool_head *pool_head_tasklet;
 extern struct pool_head *pool_head_notification;
@@ -117,10 +114,6 @@ void tasklet_kill(struct tasklet *t);
 void __task_wakeup(struct task *t);
 void __task_queue(struct task *task, struct eb_root *wq);
 
-uint sched_report_idle();
-void sched_leaving_poll(int timeout, int interrupted);
-void sched_entering_poll();
-
 unsigned int run_tasks_from_lists(unsigned int budgets[]);
 
 /*
diff --git a/src/clock.c b/src/clock.c
index ecaeca928..f68830efb 100644
--- a/src/clock.c
+++ b/src/clock.c
@@ -14,6 +14,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
 
@@ -31,6 +32,8 @@ THREAD_LOCAL struct timeval date; /* the real current date (wall
 THREAD_LOCAL struct timeval before_poll; /* system date before calling poll() */
 THREAD_LOCAL struct timeval after_poll;  /* system date after leaving poll() */
 
+static THREAD_LOCAL unsigned int samp_time; /* total elapsed time over current sample */
+static THREAD_LOCAL unsigned int idle_time; /* total idle time over current sample */
 static THREAD_LOCAL unsigned int iso_time_sec; /* last iso time value for this thread */
 static THREAD_LOCAL char iso_time_str[34];     /* ISO time representation of gettimeofday() */
 
@@ -212,6 +215,95 @@ void clock_init_thread_date(void)
 	clock_update_date(0, 1);
 }
 
+/* report the average CPU idle percentage over all running threads, between 0 and 100 */
+uint clock_report_idle(void)
+{
+	uint total = 0;
+	uint rthr = 0;
+	uint thr;
+
+	for (thr = 0; thr < MAX_THREADS; thr++) {
+		if (!(all_threads_mask & (1UL << thr)))
+			continue;
+		total += HA_ATOMIC_LOAD(&ha_thread_info[thr].idle_pct);
+		rthr++;
+	}
+	return rthr ? total / rthr : 0;
+}
+
+/* Update the idle time value twice a second, to be called after
+ * clock_update_date() when called after poll(), and currently called only by
+ * clock_leaving_poll() below. It relies on <before_poll> to be updated to
+ * the system time before calling poll().
+ */
+static inline void clock_measure_idle(void)
+{
+	/* Let's compute the idle to work ratio. We worked between after_poll
+	 * and before_poll, and slept between before_poll and date. The idle_pct
+	 * is updated at most twice every second. Note that the current second
+	 * rarely changes so we avoid a multiply when not needed.
+	 */
+	int delta;
+
+	if ((delta = date.tv_sec - before_poll.tv_sec))
+		delta *= 1000000;
+	idle_time += delta + (date.tv_usec - before_poll.tv_usec);
+
+	if ((delta = date.tv_sec - after_poll.tv_sec))
+		delta *= 1000000;
+	samp_time += delta + (date.tv_usec - after_poll.tv_usec);
+
+	after_poll.tv_sec = date.tv_sec; after_poll.tv_usec = date.tv_usec;
+	if (samp_time < 500000)
+		return;
+
+	HA_ATOMIC_STORE(&ti->idle_pct, (100ULL * idle_time + samp_time / 2) / samp_time);
+	idle_time = samp_time = 0;
+}
+
+/* Collect date and time information after leaving poll(). <timeout> must be
+ * set to the maximum sleep time passed to poll (in milliseconds), and
+ * <interrupted> must be zero if the poller reached the timeout or non-zero
+ * otherwise, which generally is provided by the poller's return value.
+ */
+void clock_leaving_poll(int timeout, int interrupted)
+{
+	clock_measure_idle();
+	ti->prev_cpu_time = now_cpu_time();
+	ti->prev_mono_time = now_mono_time();
+}
+
+/* Collect date and time information before calling poll(). This will be used
+ * to count the run time of the past loop and the sleep time of the next poll.
+ * It also compares the elasped and cpu times during the activity period to
+ * estimate the amount of stolen time, which is reported if higher than half
+ * a millisecond.
+ */
+void clock_entering_poll(void)
+{
+	uint64_t new_mono_time;
+	uint64_t new_cpu_time;
+	int64_t stolen;
+
+	gettimeofday(&before_poll, NULL);
+
+	new_cpu_time = now_cpu_time();
+	new_mono_time = now_mono_time();
+
+	if (ti->prev_cpu_time && ti->prev_mono_time) {
+		new_cpu_time -= ti->prev_cpu_time;
+		new_mono_time -= ti->prev_mono_time;
+		stolen = new_mono_time - new_cpu_time;
+		if (unlikely(stolen >= 500000)) {
+			stolen /= 500000;
+			/* more than half a millisecond difference might
+			 * indicate an undesired preemption.
+			 */
+			report_stolen_time(stolen);
+		}
+	}
+}
+
 /* returns the current date as returned by gettimeofday() in ISO+microsecond
  * format. It uses a thread-local static variable that the reader can consume
  * for as long as it wants until next call. Thus, do not call it from a signal
diff --git a/src/ev_epoll.c b/src/ev_epoll.c
index 03888d0a3..a9b572b2e 100644
--- a/src/ev_epoll.c
+++ b/src/ev_epoll.c
@@ -189,7 +189,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 
 	/* now let's wait for polled events */
 	wait_time = wake ? 0 : compute_poll_timeout(exp);
-	sched_entering_poll();
+	clock_entering_poll();
 	activity_count_runtime();
 	do {
 		int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time;
@@ -209,7 +209,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 			break;
 	} while (1);
 
-	sched_leaving_poll(wait_time, status);
+	clock_leaving_poll(wait_time, status);
 
 	thread_harmless_end();
 	thread_idle_end();
diff --git a/src/ev_evports.c b/src/ev_evports.c
index 23bf11ffa..dcc30a195 100644
--- a/src/ev_evports.c
+++ b/src/ev_evports.c
@@ -159,7 +159,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 	 * Determine how long to wait for events to materialise on the port.
 	 */
 	wait_time = wake ? 0 : compute_poll_timeout(exp);
-	sched_entering_poll();
+	clock_entering_poll();
 	activity_count_runtime();
 
 	do {
@@ -203,7 +203,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 			break;
 	} while(1);
 
-	sched_leaving_poll(wait_time, nevlist);
+	clock_leaving_poll(wait_time, nevlist);
 
 	thread_harmless_end();
 	thread_idle_end();
diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c
index df8eae263..c9f355b93 100644
--- a/src/ev_kqueue.c
+++ b/src/ev_kqueue.c
@@ -146,7 +146,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 	/* now let's wait for events */
 	wait_time = wake ? 0 : compute_poll_timeout(exp);
 	fd = global.tune.maxpollevents;
-	sched_entering_poll();
+	clock_entering_poll();
 	activity_count_runtime();
 
 	do {
@@ -175,7 +175,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 			break;
 	} while (1);
 
-	sched_leaving_poll(wait_time, status);
+	clock_leaving_poll(wait_time, status);
 
 	thread_harmless_end();
 	thread_idle_end();
diff --git a/src/ev_poll.c b/src/ev_poll.c
index 9b93c92e5..dbfa26269 100644
--- a/src/ev_poll.c
+++ b/src/ev_poll.c
@@ -202,11 +202,11 @@ static void _do_poll(struct poller *p, int exp, int wake)
 
 	/* now let's wait for events */
 	wait_time = wake ? 0 : compute_poll_timeout(exp);
-	sched_entering_poll();
+	clock_entering_poll();
 	activity_count_runtime();
 	status = poll(poll_events, nbfd, wait_time);
 	clock_update_date(wait_time, status);
-	sched_leaving_poll(wait_time, status);
+	clock_leaving_poll(wait_time, status);
 
 	thread_harmless_end();
 	thread_idle_end();
diff --git a/src/ev_select.c b/src/ev_select.c
index 47da75dac..87a926d84 100644
--- a/src/ev_select.c
+++ b/src/ev_select.c
@@ -173,7 +173,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 	delta_ms = wake ? 0 : compute_poll_timeout(exp);
 	delta.tv_sec = (delta_ms / 1000);
 	delta.tv_usec = (delta_ms % 1000) * 1000;
-	sched_entering_poll();
+	clock_entering_poll();
 	activity_count_runtime();
 	status = select(maxfd,
 			readnotnull ? tmp_evts[DIR_RD] : NULL,
@@ -181,7 +181,7 @@ static void _do_poll(struct poller *p, int exp, int wake)
 			NULL,
 			&delta);
 	clock_update_date(delta_ms, status);
-	sched_leaving_poll(delta_ms, status);
+	clock_leaving_poll(delta_ms, status);
 
 	thread_harmless_end();
 	thread_idle_end();
diff --git a/src/stats.c b/src/stats.c
index da6fc363a..bc083deac 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -3448,7 +3448,7 @@ static void stats_dump_html_info(struct stream_interface *si, struct uri_auth *u
 	              actconn, pipes_used, pipes_used+pipes_free, read_freq_ctr(&global.conn_per_sec),
 	              bps >= 1000000000UL ? (bps / 1000000000.0) : bps >= 1000000UL ? (bps / 1000000.0) : (bps / 1000.0),
 	              bps >= 1000000000UL ? 'G' : bps >= 1000000UL ? 'M' : 'k',
-	              total_run_queues(), total_allocated_tasks(), sched_report_idle()
+	              total_run_queues(), total_allocated_tasks(), clock_report_idle()
 	              );
 
 	/* scope_txt = search query, appctx->ctx.stats.scope_len is always <= STAT_SCOPE_TXT_MAXLEN */
@@ -4481,7 +4481,7 @@ int stats_fill_info(struct field *info, int len, uint flags)
 #endif
 	info[INF_TASKS] = mkf_u32(0, total_allocated_tasks());
 	info[INF_RUN_QUEUE] = mkf_u32(0, total_run_queues());
-	info[INF_IDLE_PCT] = mkf_u32(FN_AVG, sched_report_idle());
+	info[INF_IDLE_PCT] = mkf_u32(FN_AVG, clock_report_idle());
 	info[INF_NODE] = mkf_str(FO_CONFIG|FN_OUTPUT|FS_SERVICE, global.node);
 	if (global.desc)
 		info[INF_DESCRIPTION] = mkf_str(FO_CONFIG|FN_OUTPUT|FS_SERVICE, global.desc);
diff --git a/src/task.c b/src/task.c
index d54735965..45111f138 100644
--- a/src/task.c
+++ b/src/task.c
@@ -38,10 +38,6 @@ DECLARE_POOL(pool_head_notification, "notification", sizeof(struct notification)
 volatile unsigned long global_tasks_mask = 0; /* Mask of threads with tasks in the global runqueue */
 unsigned int niced_tasks = 0; /* number of niced tasks in the run queue */
 
-/* used for idle time calculation */
-THREAD_LOCAL unsigned int samp_time = 0; /* total elapsed time over current sample */
-THREAD_LOCAL unsigned int idle_time = 0; /* total idle time over current sample */
-
 THREAD_LOCAL struct task_per_thread *sched = &task_per_thread[0]; /* scheduler context for the current thread */
 
 __decl_aligned_spinlock(rq_lock); /* spin lock related to run queue */
@@ -865,95 +861,6 @@ void process_runnable_tasks()
 		activity[tid].long_rq++;
 }
 
-/* report the average CPU idle percentage over all running threads, between 0 and 100 */
-uint sched_report_idle()
-{
-	uint total = 0;
-	uint rthr = 0;
-	uint thr;
-
-	for (thr = 0; thr < MAX_THREADS; thr++) {
-		if (!(all_threads_mask & (1UL << thr)))
-			continue;
-		total += HA_ATOMIC_LOAD(&ha_thread_info[thr].idle_pct);
-		rthr++;
-	}
-	return rthr ? total / rthr : 0;
-}
-
-/* Update the idle time value twice a second, to be called after
- * clock_update_date() when called after poll(), and currently called only by
- * sched_leaving_poll() below. It relies on <before_poll> to be updated to
- * the system time before calling poll().
- */
-static inline void sched_measure_idle()
-{
-	/* Let's compute the idle to work ratio. We worked between after_poll
-	 * and before_poll, and slept between before_poll and date. The idle_pct
-	 * is updated at most twice every second. Note that the current second
-	 * rarely changes so we avoid a multiply when not needed.
-	 */
-	int delta;
-
-	if ((delta = date.tv_sec - before_poll.tv_sec))
-		delta *= 1000000;
-	idle_time += delta + (date.tv_usec - before_poll.tv_usec);
-
-	if ((delta = date.tv_sec - after_poll.tv_sec))
-		delta *= 1000000;
-	samp_time += delta + (date.tv_usec - after_poll.tv_usec);
-
-	after_poll.tv_sec = date.tv_sec; after_poll.tv_usec = date.tv_usec;
-	if (samp_time < 500000)
-		return;
-
-	HA_ATOMIC_STORE(&ti->idle_pct, (100ULL * idle_time + samp_time / 2) / samp_time);
-	idle_time = samp_time = 0;
-}
-
-/* Collect date and time information after leaving poll(). <timeout> must be
- * set to the maximum sleep time passed to poll (in milliseconds), and
- * <interrupted> must be zero if the poller reached the timeout or non-zero
- * otherwise, which generally is provided by the poller's return value.
- */
-void sched_leaving_poll(int timeout, int interrupted)
-{
-	sched_measure_idle();
-	ti->prev_cpu_time = now_cpu_time();
-	ti->prev_mono_time = now_mono_time();
-}
-
-/* Collect date and time information before calling poll(). This will be used
- * to count the run time of the past loop and the sleep time of the next poll.
- * It also compares the elasped and cpu times during the activity period to
- * estimate the amount of stolen time, which is reported if higher than half
- * a millisecond.
- */
-void sched_entering_poll()
-{
-	uint64_t new_mono_time;
-	uint64_t new_cpu_time;
-	int64_t stolen;
-
-	gettimeofday(&before_poll, NULL);
-
-	new_cpu_time = now_cpu_time();
-	new_mono_time = now_mono_time();
-
-	if (ti->prev_cpu_time && ti->prev_mono_time) {
-		new_cpu_time -= ti->prev_cpu_time;
-		new_mono_time -= ti->prev_mono_time;
-		stolen = new_mono_time - new_cpu_time;
-		if (unlikely(stolen >= 500000)) {
-			stolen /= 500000;
-			/* more than half a millisecond difference might
-			 * indicate an undesired preemption.
-			 */
-			report_stolen_time(stolen);
-		}
-	}
-}
-
 /*
  * Delete every tasks before running the master polling loop
  */
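
Side note, not part of the patch: a minimal standalone C sketch of the arithmetic performed by clock_measure_idle()/clock_report_idle() above. The function name measure_idle(), the main() driver and the timestamps are made up for illustration; only the accumulation and the rounded integer division mirror the patched code.

#include <stdio.h>
#include <sys/time.h>

static unsigned int samp_time; /* total elapsed time over current sample */
static unsigned int idle_time; /* total idle time over current sample */

/* Accumulate one poll() iteration: work happened between <after_poll> and
 * <before_poll>, sleep between <before_poll> and <date>. Once at least half
 * a second has been sampled, publish a rounded integer percentage.
 */
static void measure_idle(struct timeval after_poll, struct timeval before_poll,
                         struct timeval date, unsigned int *idle_pct)
{
	int delta;

	if ((delta = date.tv_sec - before_poll.tv_sec))
		delta *= 1000000;
	idle_time += delta + (date.tv_usec - before_poll.tv_usec);

	if ((delta = date.tv_sec - after_poll.tv_sec))
		delta *= 1000000;
	samp_time += delta + (date.tv_usec - after_poll.tv_usec);

	if (samp_time < 500000)
		return;

	/* adding samp_time/2 before dividing rounds to the nearest percent */
	*idle_pct = (100ULL * idle_time + samp_time / 2) / samp_time;
	idle_time = samp_time = 0;
}

int main(void)
{
	/* made-up timestamps: a 600ms loop of which 450ms were spent in poll() */
	struct timeval after_poll  = { 0, 0 };      /* left poll() last time */
	struct timeval before_poll = { 0, 150000 }; /* entering poll()       */
	struct timeval date        = { 0, 600000 }; /* leaving poll() now    */
	unsigned int idle_pct = 0;

	measure_idle(after_poll, before_poll, date, &idle_pct);
	printf("idle_pct = %u%%\n", idle_pct); /* prints idle_pct = 75% */
	return 0;
}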