diff --git a/Makefile b/Makefile old mode 100755 new mode 100644 index d9adca9..f1e64cc --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ ifeq ($(THREAD_SUPPORT),yes) BENCH_CFLAGS += -ftree-vectorize endif -CLEANFILES := numad.o numad .depend .depend.X +CLEANFILES := numad.o numad .depend .depend.X empty.c empty.o SOURCES := numad.c @@ -31,7 +31,7 @@ docdir := ${prefix}/share/doc all: numad -numad: numad.o -lpthread +numad: numad.o -lpthread -lrt AR ?= ar RANLIB ?= ranlib @@ -42,6 +42,7 @@ RANLIB ?= ranlib install: numad mkdir -p ${prefix}/bin + mkdir -p ${prefix}/share/man/man8 install -m 0755 numad ${prefix}/bin install -m 0644 numad.8 ${prefix}/share/man/man8 @@ -62,4 +63,3 @@ include .depend Makefile: .depend - diff --git a/numad.8 b/numad.8 index e834a78..e497ea7 100644 --- a/numad.8 +++ b/numad.8 @@ -1,5 +1,5 @@ -.TH "numad" "8" "0.5.0" "Bill Gray" "Administration" -.SH "nuamd" +.TH "numad" "8" "1.0.0" "Bill Gray" "Administration" +.SH "numad" .LP numad \- A user\-level daemon that provides advice and managment for optimum use of CPUs and memory on systems with NUMA topology. .SH "SYNTAX" @@ -30,6 +30,9 @@ numad [\fI\-u target_utilization\fP] .LP numad [\fI\-w NCPUS[:MB]\fP] .br +.LP +numad [\fI\-x PID\fP] +.br .SH "DESCRIPTION" .LP @@ -49,7 +52,7 @@ This option can be used to communicate a non-standard cgroup mount point to numad. This is not normally necessary. .TP \fB\-h\fR -Display usage help information and exit. +Display usage help information and then exit. .TP \fB\-i\fR <\fI[min_interval:]max_interval\fP> Sets the time interval that numad waits between system scans, in seconds to @@ -60,31 +63,30 @@ daemon.) A bigger <\fImax_interval\fP> will decrease numad overhead but also decrease responsiveness to changing loads. .TP \fB\-l\fR <\fIlog_level\fP> -Sets the log level to <\fIlog_level\fP>. Good choices are 5, 6, or 7. The default value is 5. +Sets the log level to <\fIlog_level\fP>. Reasonable choices are 5, 6, or 7. +The default value is 5. .TP \fB\-p\fR <\fIPID\fP> -Add PID to explicit list of processes to consider for managing, if the process -uses significant resources. Multiple \fI\-p PID\fP options can be specified at -daemon start, but after deamon start, only one PID can be added to the explicit -list per subsequent numad invocation. Use with \-S to precisely control the -scope of processes numad can manage. Note that (currently) the specified -process will not necessarily be actively managed unless it also meets numad's -significance threshold. +Add PID to explicit inclusion list of processes to consider for managing, if +the process also uses significant resources. Multiple \fI\-p PID\fP options +can be specified at daemon start, but after deamon start, only one PID can be +added to the inclusion list per subsequent numad invocation. Use with \-S to +precisely control the scope of processes numad can manage. Note that the +specified process will not necessarily be actively managed unless it also meets +numad's significance threshold -- which is currently 300MB and half a CPU. .TP \fB\-r\fR <\fIPID\fP> -Remove PID from explicit list of processes to consider for managing. After -deamon start, only one PID can be removed from the explicit process list per -subsequent numad invocation. Use with \-S and \-p to precisely control the -scope of processes numad can manage. (Currently, removed PIDs are not excluded -when scanning all processes. In the future, removed PIDs may be added to a -PID-exclusion list. 
Currently it makes sense to remove a PID from the explicit -list only when using \fI-S 0\fP and previously having used \-p to add the PID.) +Remove PID from both the explicit inclusion and the exclusion lists of +processes. After daemon start, only one PID can be removed from the explicit +process lists per subsequent numad invocation. Use with \-S and \-p and \-x to +precisely control the scope of processes numad can manage. .TP \fB\-S\fR <\fI0|1\fP> This option controls whether numad scans all system processes or only the -explicit PID list. The default is to scan all processes. Use \fI\-S 0\fP to -scan only the explicit PID list. Use \fI\-S 1\fP to again scan all system -processes. Starting numad as +processes on the explicit inclusion PID list. The default is to scan all +processes. Use \fI\-S 0\fP to scan only the explicit inclusion PID list. Use +\fI\-S 1\fP to again scan all system processes (excepting those on the explicit +exclusion list). Starting numad as .br \fInumad \-S 0 \-p \-p \-p \fP .br @@ -93,8 +95,8 @@ three explicitly specified processes. .TP \fB\-u\fR <\fItarget_utilization\fP> Set the desired maximum consumption percentage of a node. Default is 85%. -Decrease the target value to maintain more available resource margin. Increase -the target value to more exhaustively consume node resources. +Decrease the target value to maintain more available resource margin on each +node. Increase the target value to more exhaustively consume node resources. .TP \fB\-v\fR Verbose output in log, sets the log level to LOG_INFO. Same effect as \fI\-l 6\fP. @@ -104,18 +106,25 @@ Display version information and exit. .TP \fB\-w\fR <\fINCPUS[:MB]\fP> Queries numad for the best NUMA nodes to bind an entity that needs -<\fINCPUS\fP>. The amount of memory (in MBs) is optional, but should be -specified as well <\fI:MB\fP> so numad can recommend NUMA nodes with available -CPU capacity and adequate free memory. This query option can be used +<\fINCPUS\fP>. The amount of memory (in MBs) is optional, but should normally +be specified as well <\fI:MB\fP> so numad can recommend NUMA nodes with +available CPU capacity and adequate free memory. This query option can be used regardless of whether numad is running as a daemon. (An invocation using this option when numad is not running as a daemon, will not cause the daemon to -start.) Output of this option is a string that contains a NUMA node list. -For example: 2\-3,6. The recommended node list could be saved in a shell variable +start.) Output of this option is a string that contains a NUMA node list. For +example: 2\-3,6. The recommended node list could be saved in a shell variable (e.g., NODES) and then used as the node list parameter in a .br \fInumactl \-m $NODES \-N $NODES ... \fP .br command. See numactl(8). +.TP +\fB\-x\fR <\fIPID\fP> +Add PID to the explicit exclusion list of processes that numad should not manage. +Multiple \fI\-x PID\fP options can be specified at daemon start, but after +daemon start, only one PID can be added to the exclusion list per subsequent +numad invocation. Use with \-S to precisely control the scope of processes +numad can manage.
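Editorial aside, not part of the patch: the node list returned by \-w (for example "2\-3,6") uses the same cpuset-style list syntax that numad itself parses with add_ids_to_list_from_str(). The following minimal C sketch shows how such a string expands into individual node IDs; print_ids_from_list_str() is a hypothetical name used only for this illustration.

#include <ctype.h>
#include <stdio.h>

// Expand a cpuset-style list string such as "2-3,6" into individual IDs.
// Hypothetical helper: numad's add_ids_to_list_from_str() does the same
// kind of scan, but sets bits in an id_list instead of printing.
static void print_ids_from_list_str(const char *s) {
    while (*s != '\0') {
        if (!isdigit((unsigned char)*s)) { s++; continue; }   // skip ',' and other separators
        int first = 0;
        while (isdigit((unsigned char)*s)) { first = first * 10 + (*s++ - '0'); }
        int last = first;
        if (*s == '-') {                                      // a range like "2-3"
            s++;
            last = 0;
            while (isdigit((unsigned char)*s)) { last = last * 10 + (*s++ - '0'); }
        }
        for (int id = first; id <= last; id++) {
            printf("node %d\n", id);
        }
    }
}

int main(void) {
    print_ids_from_list_str("2-3,6");   // prints node 2, node 3, node 6
    return 0;
}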
.SH "FILES" .LP \fI/usr/bin/numad\fP diff --git a/numad.c b/numad.c index 8146536..7857d8c 100644 --- a/numad.c +++ b/numad.c @@ -19,32 +19,7 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - -// Compile with: gcc -O -std=gnu99 -pthread -o numad numad.c - - - - -#if 0 -/* - -TODO: -===== - -- fix issues -- verify args -- inclusion / exclusion lists, specific PIDs only, per-process ???force??? flags -- add option to self-calculate SLIT distance info -- extend dynamic intervals? -- hi / lo watermarks, and ring buffer for processor cpu_used -- write some internal documentation, explaining purposes and design -- add a lazy thread that slowly cleans up obsolete cpusets -- testing and verification - -*/ -#endif - - +// Compile with: gcc -O -std=gnu99 -Wall -pthread -o numad numad.c -lrt #define _GNU_SOURCE @@ -79,9 +54,7 @@ TODO: #include - -#define VERSION_STRING "20120522b" - +#define VERSION_STRING "20121015" #define VAR_RUN_FILE "/var/run/numad.pid" @@ -96,30 +69,23 @@ char *cpuset_dir_list[] = { }; - -#define QEMU_CPUSET_SUBDIR "libvirt/qemu" - - - -#define MAX_NODES 256 - -#define MAX_CPUS 2048 - - #define KILOBYTE (1024) #define MEGABYTE (1024 * 1024) -#define GIGABYTE (1024 * 1024 * 1024) -#define BUF_SIZE 1024 #define FNAME_SIZE 192 +#define BUF_SIZE 1024 +#define BIG_BUF_SIZE 4096 -#define CPU_SCALE_FACTOR 100 +// The ONE_HUNDRED factor is used to scale time and CPU usage units. +// Several CPU quantities are measured in percents of a CPU; and +// several time values are counted in hundreths of a second. +#define ONE_HUNDRED 100 -#define CPU_THRESHOLD 50 -#define MEMORY_THRESHOLD 300 #define MIN_INTERVAL 5 #define MAX_INTERVAL 15 +#define CPU_THRESHOLD 50 +#define MEMORY_THRESHOLD 300 #define TARGET_UTILIZATION_PERCENT 85 #define IMPROVEMENT_THRESHOLD_PERCENT 5 @@ -139,7 +105,6 @@ char *cpuset_dir_list[] = { int num_cpus = 0; int num_nodes = 0; -int cur_cpu_data_buf = 0; int page_size_in_bytes = 0; int huge_page_size_in_bytes = 0; @@ -153,6 +118,8 @@ pthread_mutex_t node_info_mutex; int requested_mbs = 0; int requested_cpus = 0; + + FILE *log_fs = NULL; int log_level = LOG_NOTICE; @@ -199,57 +166,8 @@ void close_log_file() { } -typedef struct pid_list_node { - long pid; - struct pid_list_node* next; -} pid_list_node_t, *pid_list_node_p; -pid_list_node_p pid_list = NULL; - -void insert_pid_into_pid_list(long pid) { - // Check for duplicate pid first - pid_list_node_p pid_ptr = pid_list; - while (pid_ptr != NULL) { - if (pid_ptr->pid == pid) { - return; // pid already in list - } - pid_ptr = pid_ptr->next; - } - // pid not yet in list -- insert new node - pid_ptr = malloc(sizeof(pid_list_node_t)); - if (pid_ptr == NULL) { - numad_log(LOG_CRIT, "pid_list malloc failed\n"); - exit(EXIT_FAILURE); - } - pid_ptr->pid = pid; - pid_ptr->next = pid_list; - pid_list = pid_ptr; -} - -void remove_pid_from_pid_list(long pid) { - pid_list_node_p last_pid_ptr = NULL; - pid_list_node_p pid_ptr = pid_list; - while (pid_ptr != NULL) { - if (pid_ptr->pid == pid) { - if (pid_ptr == pid_list) { - pid_list = pid_list->next; - free(pid_ptr); - pid_ptr = pid_list; - continue; - } else { - last_pid_ptr->next = pid_ptr->next; - free(pid_ptr); - pid_ptr = last_pid_ptr; - } - } - last_pid_ptr = pid_ptr; - pid_ptr = pid_ptr->next; - } -} - - - -#define MSG_BODY_TEXT_SIZE 48 +#define MSG_BODY_TEXT_SIZE 96 typedef struct msg_body { long src_pid; @@ -299,9 +217,14 @@ void send_msg(long dst_pid, long cmd, long arg1, long arg2, char *s) { msg.body.cmd = cmd; msg.body.arg1 = arg1; msg.body.arg2 = arg2; 
+ int s_len = strlen(s); + if (s_len >= MSG_BODY_TEXT_SIZE) { + numad_log(LOG_CRIT, "msgsnd text too big\n"); + exit(EXIT_FAILURE); + } strcpy(msg.body.text, s); - size_t len = sizeof(msg_body_t) - MSG_BODY_TEXT_SIZE + strlen(msg.body.text) + 1; - if (msgsnd(msg_qid, &msg, len, IPC_NOWAIT) < 0) { + size_t m_len = sizeof(msg_body_t) - MSG_BODY_TEXT_SIZE + s_len + 1; + if (msgsnd(msg_qid, &msg, m_len, IPC_NOWAIT) < 0) { numad_log(LOG_CRIT, "msgsnd failed\n"); exit(EXIT_FAILURE); } @@ -310,44 +233,6 @@ void send_msg(long dst_pid, long cmd, long arg1, long arg2, char *s) { -void shut_down_numad() { - numad_log(LOG_NOTICE, "Shutting down numad\n"); - flush_msg_queue(); - unlink(VAR_RUN_FILE); - close_log_file(); - exit(EXIT_SUCCESS); -} - - -void print_version_and_exit(char *prog_name) { - fprintf(stdout, "%s version: %s: compiled %s\n", prog_name, VERSION_STRING, __DATE__); - exit(EXIT_SUCCESS); -} - - -void print_usage_and_exit(char *prog_name) { - fprintf(stderr, "Usage: %s ...\n", prog_name); - fprintf(stderr, "-d for debug logging (same effect as '-l 7'\n"); - fprintf(stderr, "-D to specify cgroup mount point\n"); - fprintf(stderr, "-h to print this usage info\n"); - fprintf(stderr, "-i [:] to specify interval seconds\n"); - fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7)\n"); - fprintf(stderr, "-p to add PID to explicit pid list\n"); - fprintf(stderr, "-r to remove PID from explicit pid list\n"); - fprintf(stderr, "-S 1 to scan all processes\n"); - fprintf(stderr, "-S 0 to scan only explicit PID list processes\n"); - fprintf(stderr, "-u to specify target utilization percent (default 85)\n"); - fprintf(stderr, "-v for verbose (same effect as '-l 6'\n"); - fprintf(stderr, "-V to show version info\n"); - fprintf(stderr, "-w [:] for NUMA node suggestions\n"); - exit(EXIT_FAILURE); -} - - - - - - typedef struct id_list { // Use CPU_SET(3) cpuset bitmasks, // but bundle size and pointer together @@ -399,7 +284,7 @@ int add_ids_to_list_from_str(id_list_p list_p, char *s) { for (;;) { // skip over non-digits while (!isdigit(*s)) { - if (*s == '\0') { + if ((*s == '\n') || (*s == '\0')) { goto return_list; } if (*s++ == '-') { @@ -455,141 +340,55 @@ terminate_string: } -id_list_p all_cpus_list_p; -id_list_p all_nodes_list_p; - - - - - typedef struct node_data { uint64_t node_id; uint64_t MBs_total; uint64_t MBs_free; - uint64_t CPUs_total; // scaled * CPU_SCALE_FACTOR - uint64_t CPUs_free; // scaled * CPU_SCALE_FACTOR + uint64_t CPUs_total; // scaled * ONE_HUNDRED + uint64_t CPUs_free; // scaled * ONE_HUNDRED uint64_t magnitude; // hack: MBs * CPUs - uint8_t distance[MAX_NODES]; + uint8_t *distance; id_list_p cpu_list_p; } node_data_t, *node_data_p; -node_data_t node[MAX_NODES]; +node_data_p node = NULL; -int node_from_cpu[MAX_CPUS]; +// RING_BUF_SIZE must be a power of two +#define RING_BUF_SIZE 8 - -typedef struct per_cpu_stats_data { - // Statistics from /proc/stat cpu lines - // (assuming at least 2.6.24 kernel) - uint64_t user; - uint64_t nice; - uint64_t system; - uint64_t idle; - uint64_t iowait; - uint64_t irq; - uint64_t softirq; - uint64_t steal; - uint64_t guest; -} per_cpu_stats_data_t, *per_cpu_stats_data_p; - -typedef struct cpu_data { - // stats from /proc/uptime - uint64_t uptime; // scaled * CPU_SCALE_FACTOR - uint64_t sys_idle; // scaled * CPU_SCALE_FACTOR - // stats from /proc/stat, cpu lines - per_cpu_stats_data_t cpu_stats[MAX_CPUS]; -} cpu_data_t, *cpu_data_p; - -cpu_data_t cpu_data_buf[2]; // Two sets, to calc deltas - - -typedef struct stat_data { - int 
pid; - char *comm; - int state; - int ppid; - int pgrp; - int session; - int tty_nr; - int tpgid; - unsigned flags; - uint64_t minflt; - uint64_t cminflt; - uint64_t majflt; - uint64_t cmajflt; - uint64_t utime; - uint64_t stime; - int64_t cutime; - int64_t cstime; - int64_t priority; - int64_t nice; - int64_t num_threads; - int64_t itrealvalue; - uint64_t starttime; - uint64_t vsize; - int64_t rss; - uint64_t rsslim; - uint64_t startcode; - uint64_t endcode; - uint64_t startstack; - uint64_t kstkesp; - uint64_t kstkeip; - uint64_t signal; - uint64_t blocked; - uint64_t sigignore; - uint64_t sigcatch; - uint64_t wchan; - uint64_t nswap; - uint64_t cnswap; - int exit_signal; - int processor; - unsigned rt_priority; - unsigned policy; - uint64_t delayacct_blkio_ticks; - uint64_t guest_time; - int64_t cguest_time; -} stat_data_t, *stat_data_p; - -#define RING_BUF_SIZE 16 -#define RING_BUF_MASK 0xf +#define PROCESS_FLAG_INTERLEAVED (1 << 0) typedef struct process_data { - // Most process stats are derived from /proc//stat info -- subset of - // stat_data above. Currently use only about half of these... int pid; - // int pgrp; - char *comm; - uint64_t utime; - uint64_t stime; - // int64_t priority; - // int64_t nice; - int64_t num_threads; - // uint64_t starttime; - // uint64_t vsize_MBs; - uint64_t rss_MBs; - int processor; - // unsigned rt_priority; - // unsigned policy; - // uint64_t guest_time; - uint64_t uptime; // Data timestamp from /proc/uptime - char *cpuset_name; + unsigned int flags; + uint64_t data_time_stamp; // hundredths of seconds + uint64_t bind_time_stamp; + uint64_t num_threads; uint64_t MBs_used; - uint64_t CPUs_used; + uint64_t cpu_util; + uint64_t CPUs_used; // scaled * ONE_HUNDRED uint64_t CPUs_used_ring_buf[RING_BUF_SIZE]; int ring_buf_ix; int dup_bind_count; - // id_list_p node_list_p; + char *comm; + char *cpuset_name; } process_data_t, *process_data_p; -#define PROCESS_HASH_TABLE_SIZE 2003 -process_data_t process_hash_table[PROCESS_HASH_TABLE_SIZE]; + +// Hash table size must always be a power of two +#define MIN_PROCESS_HASH_TABLE_SIZE 64 +int process_hash_table_size = 0; int process_hash_collisions = 0; - +process_data_p process_hash_table = NULL; int process_hash_ix(int pid) { - return (pid % PROCESS_HASH_TABLE_SIZE); + unsigned ix = pid; + ix *= 717; + ix >>= 8; + ix &= (process_hash_table_size - 1); + return ix; } int process_hash_lookup(int pid) { @@ -601,11 +400,11 @@ int process_hash_lookup(int pid) { return ix; // found it } ix += 1; - if (ix >= PROCESS_HASH_TABLE_SIZE) { - ix = 0; - } + ix &= (process_hash_table_size - 1); if (ix == starting_ix) { - break; // table full and pid not found + // Table full and pid not found. + // This "should never happen"... + break; } } return -1; @@ -619,15 +418,12 @@ int process_hash_insert(int pid) { if (pid == process_hash_table[ix].pid) { return ix; // found it } - ix += 1; - if (ix >= PROCESS_HASH_TABLE_SIZE) { - ix = 0; - } process_hash_collisions += 1; + ix += 1; + ix &= (process_hash_table_size - 1); if (ix == starting_ix) { + // This "should never happen"... 
numad_log(LOG_ERR, "Process hash table is full\n"); - // FIXME: do something here, or preferrably much sooner, - // perhaps when some collisions threshold is passed return -1; } } @@ -637,17 +433,17 @@ int process_hash_insert(int pid) { int process_hash_update(process_data_p newp) { // This updates hash table stats for processes we are monitoring + int new_hash_table_entry = 1; int ix = process_hash_insert(newp->pid); if (ix >= 0) { process_data_p p = &process_hash_table[ix]; - p->MBs_used = newp->rss_MBs; - if (p->uptime) { + if (p->data_time_stamp) { + new_hash_table_entry = 0; p->ring_buf_ix += 1; - p->ring_buf_ix &= RING_BUF_MASK; - uint64_t utime_diff = newp->utime - p->utime; - uint64_t stime_diff = newp->stime - p->stime; - uint64_t uptime_diff = newp->uptime - p->uptime; - p->CPUs_used_ring_buf[p->ring_buf_ix] = 100 * (utime_diff + stime_diff) / uptime_diff; + p->ring_buf_ix &= (RING_BUF_SIZE - 1); + uint64_t cpu_util_diff = newp->cpu_util - p->cpu_util; + uint64_t time_diff = newp->data_time_stamp - p->data_time_stamp; + p->CPUs_used_ring_buf[p->ring_buf_ix] = 100 * (cpu_util_diff) / time_diff; // Use largest CPU utilization currently in ring buffer uint64_t max_CPUs_used = p->CPUs_used_ring_buf[0]; for (int ix = 1; (ix < RING_BUF_SIZE); ix++) { @@ -663,25 +459,14 @@ int process_hash_update(process_data_p newp) { } p->comm = strdup(newp->comm); } - // p->pgrp = newp->pgrp; - // p->nice = newp->nice; - // p->policy = newp->policy; - // p->priority = newp->priority; - p->processor = newp->processor; - p->rss_MBs = newp->rss_MBs; - // p->vsize_MBs = newp->vsize_MBs; - // p->rt_priority = newp->rt_priority; - // p->starttime = newp->starttime; - p->stime = newp->stime; + p->MBs_used = newp->MBs_used; + p->cpu_util = newp->cpu_util; p->num_threads = newp->num_threads; - p->uptime = newp->uptime; - p->utime = newp->utime; - // p->guest_time = newp->guest_time; + p->data_time_stamp = newp->data_time_stamp; } - return ix; + return new_hash_table_entry; } - int process_hash_rehash(int old_ix) { // Given the index of a table entry that would otherwise be orphaned by // process_hash_remove(), reinsert into table using PID from existing record. 
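Editorial aside, not part of the patch: the hunks above replace the old fixed-size, modulo-indexed process hash table with a growable table whose size is always a power of two, indexed by a multiplicative hash and probed linearly on collision. Below is a minimal standalone sketch of that indexing scheme; the toy_* names and the 64-entry table size are illustrative assumptions only.

#include <stdio.h>

#define TOY_TABLE_SIZE 64                   // must be a power of two so (size - 1) is a bit mask

static int toy_table[TOY_TABLE_SIZE];       // 0 means "slot empty", like pid == 0 above

// Same shape as process_hash_ix(): multiply, shift, then mask into the table.
static unsigned toy_hash_ix(int pid) {
    unsigned ix = (unsigned)pid;
    ix *= 717;                              // spread nearby pids apart
    ix >>= 8;                               // discard low-order bits
    return ix & (TOY_TABLE_SIZE - 1);       // cheap modulo -- needs a power-of-two size
}

// Open addressing with linear probing, as in process_hash_insert().
static int toy_insert(int pid) {
    unsigned ix = toy_hash_ix(pid);
    unsigned start = ix;
    while ((toy_table[ix] != 0) && (toy_table[ix] != pid)) {
        ix = (ix + 1) & (TOY_TABLE_SIZE - 1);   // probe the next slot, wrapping around
        if (ix == start) {
            return -1;                          // table full
        }
    }
    toy_table[ix] = pid;
    return (int)ix;
}

int main(void) {
    printf("pid 1234 -> slot %d\n", toy_insert(1234));
    printf("pid 1235 -> slot %d\n", toy_insert(1235));
    return 0;
}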
@@ -706,7 +491,7 @@ int process_hash_remove(int pid) { // if (dp->node_list_p) { FREE_LIST(dp->node_list_p); } memset(dp, 0, sizeof(process_data_t)); // bubble up the collision chain - while (pid = process_hash_table[++ix].pid) { + while ((pid = process_hash_table[++ix].pid) > 0) { if (process_hash_lookup(pid) < 0) { if (process_hash_rehash(ix) < 0) { numad_log(LOG_ERR, "rehash fail\n"); @@ -717,17 +502,183 @@ int process_hash_remove(int pid) { return ix; } +void process_hash_table_expand() { + // Save old table size and address + int old_size = process_hash_table_size; + process_data_p old_table = process_hash_table; + // Double size of table and allocate new space + if (old_size > 0) { + process_hash_table_size *= 2; + } else { + process_hash_table_size = MIN_PROCESS_HASH_TABLE_SIZE; + } + process_hash_table = malloc(process_hash_table_size * sizeof(process_data_t)); + if (process_hash_table == NULL) { + numad_log(LOG_CRIT, "hash table malloc failed\n"); + exit(EXIT_FAILURE); + } + // Clear the new table, and copy valid entries from old table + memset(process_hash_table, 0, process_hash_table_size * sizeof(process_data_t)); + for (int ix = 0; (ix < old_size); ix++) { + process_data_p p = &old_table[ix]; + if (p->pid) { + int new_table_ix = process_hash_insert(p->pid); + memcpy(&process_hash_table[new_table_ix], p, sizeof(process_data_t)); + } + } + if (old_table != NULL) { + free(old_table); + } +} + +void process_hash_table_cleanup(uint64_t update_time) { + int cpusets_removed = 0; + int num_hash_entries_used = 0; + for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_data_p p = &process_hash_table[ix]; + if (p->pid) { + num_hash_entries_used += 1; + if (p->data_time_stamp < update_time) { + // Mark as old, and zero CPU utilization + p->data_time_stamp = 0; + p->CPUs_used = 0; + // Check for dead pids and remove them... + char fname[FNAME_SIZE]; + snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid); + if (access(fname, F_OK) < 0) { + // Seems dead. Forget this pid -- after first checking + // and removing obsolete numad.PID cpuset directories. + snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid); + if (access(fname, F_OK) == 0) { + numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname); + int rc = rmdir(fname); + if (rc >= 0) { + cpusets_removed += 1; + } else { + numad_log(LOG_ERR, "bad cpuset rmdir\n"); + // exit(EXIT_FAILURE); + } + } + process_hash_remove(p->pid); + num_hash_entries_used -= 1; + } + } + } + } + if (cpusets_removed > 0) { + // Expire all the duplicate bind counts so things will be re-evaluated sooner. 
+ for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_hash_table[ix].dup_bind_count = 0; + } + } + // Keep hash table approximately half empty + if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) { + process_hash_table_expand(); + } +} +typedef struct pid_list { + long pid; + struct pid_list* next; +} pid_list_t, *pid_list_p; + +pid_list_p include_pid_list = NULL; +pid_list_p exclude_pid_list = NULL; + +pid_list_p insert_pid_into_pid_list(pid_list_p list_ptr, long pid) { + if (process_hash_table != NULL) { + int hash_ix = process_hash_lookup(pid); + if ((hash_ix >= 0) && (list_ptr == include_pid_list)) { + // Clear dup_bind_count, in case user wants it to be re-evaluated soon + process_hash_table[hash_ix].dup_bind_count = 0; + } + } + // Check for duplicate pid first + pid_list_p pid_ptr = list_ptr; + while (pid_ptr != NULL) { + if (pid_ptr->pid == pid) { + // pid already in list + return list_ptr; + } + pid_ptr = pid_ptr->next; + } + // pid not yet in list -- insert new node + pid_ptr = malloc(sizeof(pid_list_t)); + if (pid_ptr == NULL) { + numad_log(LOG_CRIT, "pid_list malloc failed\n"); + exit(EXIT_FAILURE); + } + pid_ptr->pid = pid; + pid_ptr->next = list_ptr; + list_ptr = pid_ptr; + return list_ptr; +} + +pid_list_p remove_pid_from_pid_list(pid_list_p list_ptr, long pid) { + pid_list_p last_pid_ptr = NULL; + pid_list_p pid_ptr = list_ptr; + while (pid_ptr != NULL) { + if (pid_ptr->pid == pid) { + if (pid_ptr == list_ptr) { + list_ptr = list_ptr->next; + free(pid_ptr); + pid_ptr = list_ptr; + continue; + } else { + last_pid_ptr->next = pid_ptr->next; + free(pid_ptr); + pid_ptr = last_pid_ptr; + } + } + last_pid_ptr = pid_ptr; + pid_ptr = pid_ptr->next; + } + return list_ptr; +} +void shut_down_numad() { + numad_log(LOG_NOTICE, "Shutting down numad\n"); + flush_msg_queue(); + unlink(VAR_RUN_FILE); + close_log_file(); + exit(EXIT_SUCCESS); +} + + +void print_version_and_exit(char *prog_name) { + fprintf(stdout, "%s version: %s: compiled %s\n", prog_name, VERSION_STRING, __DATE__); + exit(EXIT_SUCCESS); +} + + +void print_usage_and_exit(char *prog_name) { + fprintf(stderr, "Usage: %s ...\n", prog_name); + fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n"); + fprintf(stderr, "-D to specify cgroup mount point\n"); + fprintf(stderr, "-h to print this usage info\n"); + fprintf(stderr, "-i [:] to specify interval seconds\n"); + fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7)\n"); + fprintf(stderr, "-p to add PID to inclusion pid list\n"); + fprintf(stderr, "-r to remove PID from explicit pid lists\n"); + fprintf(stderr, "-S 1 to scan all processes\n"); + fprintf(stderr, "-S 0 to scan only explicit PID list processes\n"); + fprintf(stderr, "-u to specify target utilization percent (default 85)\n"); + fprintf(stderr, "-v for verbose (same effect as '-l 6')\n"); + fprintf(stderr, "-V to show version info\n"); + fprintf(stderr, "-w [:] for NUMA node suggestions\n"); + fprintf(stderr, "-x to add PID to exclusion pid list\n"); + exit(EXIT_FAILURE); +} + void check_prereqs(char *prog_name) { // Verify cpusets are available on this system. 
char **dir = &cpuset_dir_list[0]; - if (*dir == NULL) { *dir++; } + if (*dir == NULL) { dir++; } while (*dir != NULL) { cpuset_dir = *dir; char fname[FNAME_SIZE]; @@ -735,7 +686,7 @@ void check_prereqs(char *prog_name) { if (access(fname, F_OK) == 0) { break; } - *dir++; + dir++; } if (*dir == NULL) { fprintf(stderr, "\n"); @@ -746,10 +697,10 @@ void check_prereqs(char *prog_name) { fprintf(stderr, " # mount cgroup -t cgroup -o cpuset \n"); fprintf(stderr, " where is something like:\n"); dir = &cpuset_dir_list[0]; - if (*dir == NULL) { *dir++; } + if (*dir == NULL) { dir++; } while (*dir != NULL) { fprintf(stderr, " - %s\n", *dir); - *dir++; + dir++; } fprintf(stderr, "and then try again...\n"); fprintf(stderr, "Or, use '-D ' to specify the correct mount point\n"); @@ -767,22 +718,19 @@ void check_prereqs(char *prog_name) { if (bytes > 0) { char *p = buf; CONVERT_DIGITS_TO_NUM(p, ms); - } - if (ms > 150) { - fprintf(stderr, "\n"); - numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); - fprintf(stderr, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); - fprintf(stderr, "Consider increasing the frequency of THP scanning,\n"); - fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname); - fprintf(stderr, "to more agressively (re)construct THPs. For example:\n"); - fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n"); - fprintf(stderr, "\n"); - // For a similar tool, IBM monitored the cpu utilization of the khugepaged - // kernel thread using a pidstat like utility and saw very low utilization - // while running with this configuration (0.05%) while maintaining a high - // ratio of anonymous huge pages / anonymous pages (99%). + if (ms > 150) { + fprintf(stderr, "\n"); + numad_log(LOG_NOTICE, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); + fprintf(stderr, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); + fprintf(stderr, "Consider increasing the frequency of THP scanning,\n"); + fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname); + fprintf(stderr, "to more agressively (re)construct THPs. For example:\n"); + fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n"); + fprintf(stderr, "\n"); + } } } + // FIXME: ?? check for enabled ksmd, and recommend disabling ksm? } @@ -794,23 +742,24 @@ int get_daemon_pid() { char buf[BUF_SIZE]; int bytes = read(fd, buf, BUF_SIZE); close(fd); - if (bytes > 0) { - int pid; - char *p = buf; - CONVERT_DIGITS_TO_NUM(p, pid); - // Check run file pid still active - char fname[FNAME_SIZE]; - snprintf(fname, FNAME_SIZE, "/proc/%d", pid); - if (access(fname, F_OK) < 0) { - if (errno == ENOENT) { - numad_log(LOG_NOTICE, "Removing out-of-date numad run file because %s doesn't exist\n", fname); - unlink(VAR_RUN_FILE); - } - return 0; - } - // Daemon must be running already. - return pid; + if (bytes <= 0) { + return 0; } + int pid; + char *p = buf; + CONVERT_DIGITS_TO_NUM(p, pid); + // Check run file pid still active + char fname[FNAME_SIZE]; + snprintf(fname, FNAME_SIZE, "/proc/%d", pid); + if (access(fname, F_OK) < 0) { + if (errno == ENOENT) { + numad_log(LOG_NOTICE, "Removing out-of-date numad run file because %s doesn't exist\n", fname); + unlink(VAR_RUN_FILE); + } + return 0; + } + // Daemon must be running already. 
+ return pid; } @@ -859,22 +808,48 @@ fail_numad_run_file: } -static __inline__ uint64_t rdtsc() { - unsigned hi, lo; - // __asm__ __volatile__ ( "cpuid" ); - // ?? use rdtscp on i7 (and later), instead of previous cpuid - __asm__ __volatile__ ( "rdtsc" : "=a"(lo), "=d"(hi)); - return ( (uint64_t)lo) | ( ((uint64_t)hi) << 32); +int get_num_cpus() { + int n1 = sysconf(_SC_NPROCESSORS_CONF); + int n2 = sysconf(_SC_NPROCESSORS_ONLN); + if (n1 < n2) { + n1 = n2; + } + if (n1 < 0) { + numad_log(LOG_CRIT, "Cannot count number of processors\n"); + exit(EXIT_FAILURE); + } + return n1; } -int get_num_cpus() { - int n = sysconf(_SC_NPROCESSORS_CONF); - int n2 = sysconf(_SC_NPROCESSORS_ONLN); - if (n < n2) { - n = n2; +int get_num_kvm_vcpu_threads(int pid) { + // Try to return the number of vCPU threads for this VM guest, + // excluding the IO threads. All failures return MAXINT. + // FIXME: figure out some better way to do this... + char fname[FNAME_SIZE]; + snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid); + int fd = open(fname, O_RDONLY, 0); + if (fd >= 0) { + char buf[BUF_SIZE]; + int bytes = read(fd, buf, BUF_SIZE); + close(fd); + if (bytes > 0) { + char *p = memmem(buf, bytes, "smp", 3); + if (p != NULL) { + while (!isdigit(*p) && (p - buf < bytes - 2)) { + p++; + } + if (isdigit(*p)) { + int vcpu_threads; + CONVERT_DIGITS_TO_NUM(p, vcpu_threads); + if ((vcpu_threads > 0) && (vcpu_threads <= num_cpus)) { + return vcpu_threads; + } + } + } + } } - return n; + return MAXINT; } @@ -901,65 +876,23 @@ int get_huge_page_size_in_bytes() { } -uint64_t get_uptime(uint64_t *idle_ptr) { - assert(sysconf(_SC_CLK_TCK) == CPU_SCALE_FACTOR); - // Return scaled system uptime info from /proc/uptime - // If non-null idle_ptr provided, get system idle time too. - static int fd = -1; - if (fd >= 0) { - // rewind the file - lseek(fd, 0, SEEK_SET); - } else { - fd = open("/proc/uptime", O_RDONLY, 0); - if (fd < 0) { - goto fail_proc_uptime; - } +uint64_t get_time_stamp() { + // Return time stamp in hundredths of a second + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) { + numad_log(LOG_CRIT, "Cannot get clock_gettime()\n"); + exit(EXIT_FAILURE); } - char buf[BUF_SIZE]; - int bytes = read(fd, buf, BUF_SIZE); - if (bytes <= 0) { - goto fail_proc_uptime; - } - char *p = buf; - uint64_t tmpul[2]; - // Parse one or two numbers depending on whether or not caller wants system idle - for (int ix = 0; (ix < (1 + (idle_ptr != NULL))); ix++) { - // skip over non-digits - while (!isdigit(*p)) { - if (*p++ == '\0') { - goto fail_proc_uptime; - } - } - // convert consecutive digits to a number, ignoring decimal point thus scaling by 100 - tmpul[ix] = *p++ - '0'; - while (isdigit(*p)) { - tmpul[ix] *= 10; - tmpul[ix] += (*p++ - '0'); - if (*p == '.') { - p++; // Skip decimal point - } - } - } - if (idle_ptr != NULL) { - *idle_ptr = tmpul[1]; - } - return tmpul[0]; -fail_proc_uptime: - numad_log(LOG_CRIT, "Cannot get /proc/uptime contents\n"); - exit(EXIT_FAILURE); + return (ts.tv_sec * ONE_HUNDRED) + + (ts.tv_nsec / (1000000000 / ONE_HUNDRED)); } - static int name_starts_with_digit(const struct dirent *dptr) { return (isdigit(dptr->d_name[0])); } - - - - int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_list_p, id_list_p cpu_list_p) { // Check basic parameter validity. 
if (pid <= 0) { @@ -1007,7 +940,7 @@ int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_l cpuset_name = cpuset_name_buf; // Now that we have a cpuset for pid and a populated cpulist, // start the actual binding and migration. - uint64_t t0 = get_uptime(NULL); + uint64_t t0 = get_time_stamp(); // Write "1" out to cpuset.memory_migrate file char fname[FNAME_SIZE]; @@ -1062,11 +995,12 @@ int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_l // copy pid tasks, one at a time numad_log(LOG_NOTICE, "Including task: %s\n", namelist[ix]->d_name); write(fd, namelist[ix]->d_name, strlen(namelist[ix]->d_name)); + free(namelist[ix]); } free(namelist); close(fd); - uint64_t t1 = get_uptime(NULL); + uint64_t t1 = get_time_stamp(); // Check pid still active snprintf(fname, FNAME_SIZE, "/proc/%d", pid); if (access(fname, F_OK) < 0) { @@ -1078,16 +1012,12 @@ int bind_process_and_migrate_memory(int pid, char *cpuset_name, id_list_p node_l } - - - - void show_nodes() { time_t ts = time(NULL); fprintf(log_fs, "%s", ctime(&ts)); fprintf(log_fs, "Nodes: %d\n", num_nodes); for (int ix = 0; (ix < num_nodes); ix++) { - fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %ld, CPUs_total %ld, CPUs_free %ld, Distance: ", + fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ", ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free); for (int d = 0; (d < num_nodes); d++) { fprintf(log_fs, "%d ", node[ix].distance[d]); @@ -1101,176 +1031,235 @@ void show_nodes() { } -int update_cpu_data() { - char buf[BUF_SIZE]; - FILE *fs = fopen("/proc/stat", "r"); - if (!fs) { - goto fail_proc_stat; +typedef struct cpu_data { + uint64_t time_stamp; + uint64_t *idle; +} cpu_data_t, *cpu_data_p; + +cpu_data_t cpu_data_buf[2]; // Two sets, to calc deltas +int cur_cpu_data_buf = 0; + + +void update_cpu_data() { + // Parse idle percents from CPU stats in /proc/stat cpu lines + static FILE *fs = NULL; + if (fs != NULL) { + rewind(fs); + } else { + fs = fopen("/proc/stat", "r"); + if (!fs) { + numad_log(LOG_CRIT, "Cannot get /proc/stat contents\n"); + exit(EXIT_FAILURE); + } + cpu_data_buf[0].idle = malloc(num_cpus * sizeof(uint64_t)); + cpu_data_buf[1].idle = malloc(num_cpus * sizeof(uint64_t)); + if ((cpu_data_buf[0].idle == NULL) || (cpu_data_buf[1].idle == NULL)) { + numad_log(LOG_CRIT, "cpu_data_buf malloc failed\n"); + exit(EXIT_FAILURE); + } } + // Use the other cpu_data buffer... int new = 1 - cur_cpu_data_buf; - // First get new uptime and idle time - cpu_data_buf[new].uptime = get_uptime(&cpu_data_buf[new].sys_idle); - // Now get stats for each cpu + // First get the current time stamp + cpu_data_buf[new].time_stamp = get_time_stamp(); + // Now pull the idle stat from each cpu line + char buf[BUF_SIZE]; while (fgets(buf, BUF_SIZE, fs)) { -#if (NEED_MORE_THAN_IDLE) - if (strstr(buf, "cpu ") == buf) { - // Skip system total line... 
- continue; - } - int cpu_id; - uint64_t tmpul[9]; // assume 9 fields for /proc/stat cpu lines - memset(tmpul, 0, sizeof(tmpul)); - if (9 <= sscanf(buf, "cpu%d %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu", - &cpu_id, &tmpul[0], &tmpul[1], &tmpul[2], &tmpul[3], - &tmpul[4], &tmpul[5], &tmpul[6], &tmpul[7], &tmpul[8])) { - cpu_data_buf[new].cpu_stats[cpu_id].user = tmpul[0]; - cpu_data_buf[new].cpu_stats[cpu_id].nice = tmpul[1]; - cpu_data_buf[new].cpu_stats[cpu_id].system = tmpul[2]; - cpu_data_buf[new].cpu_stats[cpu_id].idle = tmpul[3]; - cpu_data_buf[new].cpu_stats[cpu_id].iowait = tmpul[4]; - cpu_data_buf[new].cpu_stats[cpu_id].irq = tmpul[5]; - cpu_data_buf[new].cpu_stats[cpu_id].softirq = tmpul[6]; - cpu_data_buf[new].cpu_stats[cpu_id].steal = tmpul[7]; - cpu_data_buf[new].cpu_stats[cpu_id].guest = tmpul[8]; - } -#else + /* + * Lines are of the form: + * + * cpu user nice system idle iowait irq softirq steal guest guest_nice + * + * # cat /proc/stat + * cpu 11105906 0 78639 3359578423 24607 151679 322319 0 0 0 + * cpu0 190540 0 1071 52232942 39 7538 234039 0 0 0 + * cpu1 124519 0 50 52545188 0 1443 6267 0 0 0 + * cpu2 143133 0 452 52531440 36 1573 834 0 0 0 + * . . . . + */ if ( (buf[0] == 'c') && (buf[1] == 'p') && (buf[2] == 'u') && (isdigit(buf[3])) ) { - register char *p = &buf[3]; + char *p = &buf[3]; int cpu_id = *p++ - '0'; while (isdigit(*p)) { cpu_id *= 10; cpu_id += (*p++ - '0'); } while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip user while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip nice while (!isdigit(*p)) { p++; } while (isdigit(*p)) { p++; } // skip system while (!isdigit(*p)) { p++; } uint64_t idle = *p++ - '0'; while (isdigit(*p)) { idle *= 10; idle += (*p++ - '0'); } - cpu_data_buf[new].cpu_stats[cpu_id].idle = idle; + cpu_data_buf[new].idle[cpu_id] = idle; } -#endif } - fclose(fs); cur_cpu_data_buf = new; - return (cpu_data_buf[1 - new].uptime > 0); // True if both buffers valid -fail_proc_stat: - numad_log(LOG_CRIT, "Cannot get /proc/stat contents\n"); - exit(EXIT_FAILURE); } +int node_and_digits(const struct dirent *dptr) { + char *p = (char *)(dptr->d_name); + if (*p++ != 'n') return 0; + if (*p++ != 'o') return 0; + if (*p++ != 'd') return 0; + if (*p++ != 'e') return 0; + do { + if (!isdigit(*p++)) + return 0; + } while (*p != '\0'); + return 1; +} + + +id_list_p all_cpus_list_p = NULL; +id_list_p all_nodes_list_p = NULL; +uint64_t node_info_time_stamp = 0; + + int update_nodes() { - num_nodes = 0; - CLEAR_LIST(all_cpus_list_p); - CLEAR_LIST(all_nodes_list_p); - // While counting num_nodes, get various data for each node - for (;;) { - char buf[BUF_SIZE]; - char fname[FNAME_SIZE]; - // Get all the CPU IDs in this node... Read lines from node/cpulist - // file, and set the corresponding bits in the node cpu list. - snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", num_nodes); - FILE *fs = fopen(fname, "r"); - if ((!fs) || (!fgets(buf, BUF_SIZE, fs))) { - break; // Assume the only failure here will be the end of the node directories.... - } - fclose(fs); - // Assign the node id - if (num_nodes >= MAX_NODES) { - numad_log(LOG_CRIT, "NODE table too small\n"); + char fname[FNAME_SIZE]; + char buf[BIG_BUF_SIZE]; + // First, check to see if we should refresh basic node info that probably never changes... 
+ uint64_t time_stamp = get_time_stamp(); +#define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED) + if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) { + // Count directory names of the form: /sys/devices/system/node/node + struct dirent **namelist; + int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL); + if (num_files < 1) { + numad_log(LOG_CRIT, "Could not get NUMA node info\n"); exit(EXIT_FAILURE); } - node[num_nodes].node_id = num_nodes; - ADD_ID_TO_LIST(num_nodes, all_nodes_list_p); - // get cpulist from the cpulist string - CLEAR_LIST(node[num_nodes].cpu_list_p); - int n = add_ids_to_list_from_str(node[num_nodes].cpu_list_p, buf); - OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[num_nodes].cpu_list_p); - node[num_nodes].CPUs_total = n * CPU_SCALE_FACTOR; - // Get distance vector of ACPI SLIT data from node/distance file - snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/distance", num_nodes); - fs = fopen(fname, "r"); - if (!fs) { - numad_log(LOG_CRIT, "Unexpected fopen() failure while getting node distance data\n"); - exit(EXIT_FAILURE); - } - int rnode = 0; -get_next_latf_line: - while (fgets(buf, BUF_SIZE, fs)) { - char *p = buf; - for (;;) { - // skip over non-digits - while (!isdigit(*p)) { - if (*p++ == '\0') { - goto get_next_latf_line; - } - } - int latf; - CONVERT_DIGITS_TO_NUM(p, latf); - node[num_nodes].distance[rnode++] = latf; + int need_to_realloc = (num_files != num_nodes); + if (need_to_realloc) { + for (int ix = num_files; (ix < num_nodes); ix++) { + // If new < old, free old node_data pointers + free(node[ix].distance); + FREE_LIST(node[ix].cpu_list_p); } + node = realloc(node, (num_files * sizeof(node_data_t))); + if (node == NULL) { + numad_log(LOG_CRIT, "node realloc failed\n"); + exit(EXIT_FAILURE); + } + for (int ix = num_nodes; (ix < num_files); ix++) { + // If new > old, nullify new node_data pointers + node[ix].distance = NULL; + node[ix].cpu_list_p = NULL; + } + num_nodes = num_files; } - fclose(fs); - // Get available memory info from node/meminfo file - snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", num_nodes); - fs = fopen(fname, "r"); - if (!fs) { - numad_log(LOG_CRIT, "Unexpected fopen() failure while getting node meminfo\n"); - exit(EXIT_FAILURE); - } - int MemTotal = 0; - int MemFree = 0; -get_next_mem_line: - while (fgets(buf, BUF_SIZE, fs) && ((!MemTotal) || (!MemFree))) { - uint64_t *MB_ptr = NULL; - char *p = strstr(buf, "MemTotal"); - if (p) { - MemTotal = 1; - MB_ptr = &node[num_nodes].MBs_total; - } else { - p = strstr(buf, "MemFree"); - if (p) { - MemFree = 1; - MB_ptr = &node[num_nodes].MBs_free; - } else { - goto get_next_mem_line; - } - } - // skip over non-digits - while (!isdigit(*p)) { - if (*p++ == '\0') { - goto get_next_mem_line; - } - } - uint64_t KB; - CONVERT_DIGITS_TO_NUM(p, KB); - *MB_ptr = KB / 1024; - } - fclose(fs); - // Increment node number - num_nodes += 1; - } - // Sum CPU idle data for each node to calculate current node available capacity - if (update_cpu_data()) { + CLEAR_LIST(all_cpus_list_p); + CLEAR_LIST(all_nodes_list_p); + // For each "node" filename present, save in node[ix].node_id + // Note that the node id might not necessarily match the node ix. + // Also populate the cpu lists and distance vectors for this node. 
for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { + int node_id; + char *p = &namelist[node_ix]->d_name[4]; + CONVERT_DIGITS_TO_NUM(p, node_id); + free(namelist[node_ix]); + node[node_ix].node_id = node_id; + ADD_ID_TO_LIST(node_id, all_nodes_list_p); + // Get all the CPU IDs in this node... Read lines from node/cpulist + // file, and set the corresponding bits in the node cpu list. + snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id); + int fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { + // get cpulist from the cpulist string + CLEAR_LIST(node[node_ix].cpu_list_p); + int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf); + OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p); + node[node_ix].CPUs_total = n * ONE_HUNDRED; + close(fd); + } else { + numad_log(LOG_CRIT, "Could not get node cpu list\n"); + exit(EXIT_FAILURE); + } + // Get distance vector of ACPI SLIT data from node/distance file + if (need_to_realloc) { + node[node_ix].distance = realloc(node[node_ix].distance, (num_nodes * sizeof(uint8_t))); + if (node[node_ix].distance == NULL) { + numad_log(LOG_CRIT, "node distance realloc failed\n"); + exit(EXIT_FAILURE); + } + } + snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/distance", node_id); + fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { + int rnode = 0; + for (char *p = buf; (*p != '\n'); ) { + int lat; + CONVERT_DIGITS_TO_NUM(p, lat); + node[node_ix].distance[rnode++] = lat; + while (*p == ' ') { p++; } + } + close(fd); + } else { + numad_log(LOG_CRIT, "Could not get node distance data\n"); + exit(EXIT_FAILURE); + } + } + free(namelist); + } + // Second, get the dynamic free memory and available CPU capacity + update_cpu_data(); + for (int node_ix = 0; (node_ix < num_nodes); node_ix++) { + int node_id = node[node_ix].node_id; + // Get available memory info from node/meminfo file + snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id); + int fd = open(fname, O_RDONLY, 0); + if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) { + uint64_t KB; + char *p = strstr(buf, "MemTotal:"); + if (p != NULL) { + p += 9; + } else { + numad_log(LOG_CRIT, "Could not get node MemTotal\n"); + exit(EXIT_FAILURE); + } + while (!isdigit(*p)) { p++; } + CONVERT_DIGITS_TO_NUM(p, KB); + node[node_ix].MBs_total = KB / KILOBYTE; + p = strstr(p, "MemFree:"); + if (p != NULL) { + p += 8; + } else { + numad_log(LOG_CRIT, "Could not get node MemFree\n"); + exit(EXIT_FAILURE); + } + while (!isdigit(*p)) { p++; } + CONVERT_DIGITS_TO_NUM(p, KB); + node[node_ix].MBs_free = KB / KILOBYTE; + close(fd); + } else { + numad_log(LOG_CRIT, "Could not get node meminfo\n"); + exit(EXIT_FAILURE); + } + // If both buffers have been populated by now, sum CPU idle data + // for each node in order to calculate available capacity + int old_cpu_data_buf = 1 - cur_cpu_data_buf; + if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) { uint64_t idle_ticks = 0; - int old = 1 - cur_cpu_data_buf; int cpu = 0; - int n = node[node_ix].CPUs_total / 100; - while (n) { + int num_cpus_to_process = node[node_ix].CPUs_total / ONE_HUNDRED; + while (num_cpus_to_process) { if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) { - node_from_cpu[cpu] = node_ix; // construct cpu to node map - idle_ticks += cpu_data_buf[cur_cpu_data_buf].cpu_stats[cpu].idle - cpu_data_buf[old].cpu_stats[cpu].idle; - n -= 1; + idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu] + - 
cpu_data_buf[old_cpu_data_buf].idle[cpu]; + num_cpus_to_process -= 1; } cpu += 1; } - uint64_t uptime_diff = cpu_data_buf[cur_cpu_data_buf].uptime - cpu_data_buf[old].uptime; - // printf("Node: %d CPUs: %d Uptime diff %ld Idle ticks %ld\n", node_ix, node[node_ix].CPUs_total, uptime_diff, idle_ticks); - node[node_ix].CPUs_free = (100 * idle_ticks) / uptime_diff; + uint64_t time_diff = cpu_data_buf[cur_cpu_data_buf].time_stamp + - cpu_data_buf[old_cpu_data_buf].time_stamp; + // printf("Node: %d CPUs: %ld time diff %ld Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks); + // assert(time_diff > 0); + node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff; if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) { node[node_ix].CPUs_free = node[node_ix].CPUs_total; } node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free; + } else { + node[node_ix].CPUs_free = 0; + node[node_ix].magnitude = 0; } } - // FIXME: add code here to calculate a new capacity vector scaled more precisely by node distance if (log_level >= LOG_INFO) { show_nodes(); } @@ -1278,77 +1267,120 @@ get_next_mem_line: } +typedef struct stat_data { + // This structure isn't actually used in numad -- it is here just to + // document the field type and order of the /proc//stat items, some of + // which are used in the process_data_t structure. + int pid; // 0 + char *comm; // 1 + char state; + int ppid; + int pgrp; + int session; + int tty_nr; + int tpgid; + unsigned flags; + uint64_t minflt; + uint64_t cminflt; + uint64_t majflt; + uint64_t cmajflt; + uint64_t utime; // 13 + uint64_t stime; // 14 + int64_t cutime; + int64_t cstime; + int64_t priority; // 17 + int64_t nice; + int64_t num_threads; // 19 + int64_t itrealvalue; + uint64_t starttime; + uint64_t vsize; + int64_t rss; // 23 + uint64_t rsslim; + uint64_t startcode; + uint64_t endcode; + uint64_t startstack; + uint64_t kstkesp; + uint64_t kstkeip; + uint64_t signal; + uint64_t blocked; + uint64_t sigignore; + uint64_t sigcatch; + uint64_t wchan; + uint64_t nswap; + uint64_t cnswap; + int exit_signal; + int processor; + unsigned rt_priority; + unsigned policy; // 40 + uint64_t delayacct_blkio_ticks; + uint64_t guest_time; // 42 + int64_t cguest_time; +} stat_data_t, *stat_data_p; -stat_data_p get_stat_data(char *fname) { - FILE *fs = fopen(fname, "r"); - if (!fs) { +process_data_p get_stat_data_for_pid(int pid, char *pid_string) { + // Note: This function uses static data buffers and is not thread safe. 
+ char fname[FNAME_SIZE]; + if (pid >= 0) { + snprintf(fname, FNAME_SIZE, "/proc/%d/stat", pid); + } else { + snprintf(fname, FNAME_SIZE, "/proc/%s/stat", pid_string); + } + int fd = open(fname, O_RDONLY, 0); + if (fd < 0) { numad_log(LOG_WARNING, "Could not open stat file: %s\n", fname); return NULL; } static char buf[BUF_SIZE]; - register char *p = fgets(buf, BUF_SIZE, fs); - fclose(fs); - if (!p) { + int bytes = read(fd, buf, BUF_SIZE); + if (bytes < 50) { numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname); return NULL; } - static stat_data_t data; -#if (NEED_ALL_STAT_DATA) - static char comm_buf[BUF_SIZE]; - data.comm = comm_buf; - int scanf_elements = sscanf(buf, - "%d %s %c %d %d %d %d %d %u %lu %lu %lu %lu %lu %lu %ld %ld " - "%ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu %lu %lu %lu " - "%lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld", - &data.pid, data.comm, &data.state, &data.ppid, &data.pgrp, - &data.session, &data.tty_nr, &data.tpgid, &data.flags, &data.minflt, - &data.cminflt, &data.majflt, &data.cmajflt, &data.utime, &data.stime, - &data.cutime, &data.cstime, &data.priority, &data.nice, - &data.num_threads, &data.itrealvalue, &data.starttime, &data.vsize, - &data.rss, &data.rsslim, &data.startcode, &data.endcode, - &data.startstack, &data.kstkesp, &data.kstkeip, &data.signal, - &data.blocked, &data.sigignore, &data.sigcatch, &data.wchan, - &data.nswap, &data.cnswap, &data.exit_signal, &data.processor, - &data.rt_priority, &data.policy, &data.delayacct_blkio_ticks, - &data.guest_time, &data.cguest_time); - if (scanf_elements < 43) { - numad_log(LOG_WARNING, "Could not parse stat file\n"); - return NULL; - } -#else - // Just parse a select few of the data items - data.pid = *p++ - '0'; while (isdigit(*p)) { data.pid *= 10; data.pid += (*p++ - '0'); } + close(fd); + char *p = buf; + static process_data_t data; + // Get PID from field 0 + uint64_t val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } + data.pid = val; + // Copy comm from field 1 while (*p == ' ') { p++; } data.comm = p; while (*p != ' ') { p++; } - *p++ = '\0'; + *p++ = '\0'; // replacing the presumed single ' ' before next field + // Skip fields 2 through 12 for (int ix = 0; (ix < 11); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } - data.utime = *p++ - '0'; while (isdigit(*p)) { data.utime *= 10; data.utime += (*p++ - '0'); } + // Get utime from field 13 for cpu_util + val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } + data.cpu_util = val; + // Get stime from field 14 to add on to cpu_util (which already has utime) while (*p == ' ') { p++; } - data.stime = *p++ - '0'; while (isdigit(*p)) { data.stime *= 10; data.stime += (*p++ - '0'); } + val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } + data.cpu_util += val; + // Skip fields 15 through 18 while (*p == ' ') { p++; } for (int ix = 0; (ix < 4); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } - data.num_threads = *p++ - '0'; while (isdigit(*p)) { data.num_threads *= 10; data.num_threads += (*p++ - '0'); } + // Get num_threads from field 19 + val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } + data.num_threads = val; + // Skip fields 20 through 22 while (*p == ' ') { p++; } for (int ix = 0; (ix < 3); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } - data.rss = *p++ - '0'; while (isdigit(*p)) { data.rss *= 10; data.rss += (*p++ - '0'); } - while (*p == ' ') { p++; } - for (int ix = 0; (ix < 14); ix++) { while 
(*p != ' ') { p++; } while (*p == ' ') { p++; } } - data.processor = *p++ - '0'; while (isdigit(*p)) { data.processor *= 10; data.processor += (*p++ - '0'); } -#endif + // Get rss from field 23 to compute MBs_used + val = *p++ - '0'; while (isdigit(*p)) { val *= 10; val += (*p++ - '0'); } + data.MBs_used = (val * page_size_in_bytes) / MEGABYTE; + // Return pointer to data return &data; } int update_processes() { - // Scan /proc//stat files for processes we should perhaps manage. For all - // processes, evaluate whether or not they should be added to our hash - // table short list of managed processes candidates. If so, update the statistics, - // time stamp and utilization numbers for the select processes in the hash table. - // Then, go through all managed processes to prune out-of-date or dead ones. - uint64_t this_update_time = get_uptime(NULL); - stat_data_p sdata_p; - char fname[FNAME_SIZE]; + // Conditionally scan /proc//stat files for processes we should + // perhaps manage. For all processes, evaluate whether or not they should + // be added to our hash table of managed processes candidates. If so, + // update the statistics, time stamp and utilization numbers for the select + // processes in the hash table. + uint64_t this_update_time = get_time_stamp(); + int new_candidates = 0; // limit number of new candidates per update int files = 0; if (scan_all_processes) { struct dirent **namelist; @@ -1358,144 +1390,95 @@ int update_processes() { exit(EXIT_FAILURE); } for (int ix = 0; (ix < files); ix++) { - snprintf(fname, FNAME_SIZE, "/proc/%s/stat", namelist[ix]->d_name); - if ((sdata_p = get_stat_data(fname)) != NULL) { - // See if this process significant enough to be managed on short list - // FIXME: should not this first conditional be the same as magnitude test in manage_load()? - if ( (sdata_p->rss * page_size_in_bytes > MEMORY_THRESHOLD * MEGABYTE)) { - process_data_t pdata; - pdata.uptime = get_uptime(NULL); // get_uptime() soon after get_stat_data() - // FIXME: Replace unconditional "1" below with more filters - // and tests here as well as perhaps explicit process - // inclusion and exclusion lists, before we decide to add - // this process to our managed set of candidate processes. - // - // Should perhaps include all VM processes as well as any - // other process with a non-default cpuset. - // - // printf("PID: %d %s %d %lu\n", sdata_p->pid, sdata_p->comm, sdata_p->num_threads, sdata_p->vsize); - if (1) { - pdata.pid = sdata_p->pid; - // pdata.pgrp = sdata_p->pgrp; - pdata.comm = sdata_p->comm; - pdata.utime = sdata_p->utime; - pdata.stime = sdata_p->stime; - // pdata.priority = sdata_p->priority; - // pdata.nice = sdata_p->nice; - pdata.num_threads = sdata_p->num_threads; - // pdata.starttime = sdata_p->starttime; - // pdata.vsize_MBs = sdata_p->vsize / MEGABYTE; - pdata.rss_MBs = (sdata_p->rss * page_size_in_bytes) / MEGABYTE; - pdata.processor = sdata_p->processor; - // pdata.rt_priority = sdata_p->rt_priority; - // pdata.policy = sdata_p->policy; - // pdata.guest_time = sdata_p->guest_time; - process_hash_update(&pdata); - } + process_data_p data_p; + if ((data_p = get_stat_data_for_pid(-1, namelist[ix]->d_name)) != NULL) { + // See if this process uses enough memory to be managed. 
+ if ((data_p->MBs_used > MEMORY_THRESHOLD) + && (new_candidates < process_hash_table_size / 3)) { + data_p->data_time_stamp = get_time_stamp(); + new_candidates += process_hash_update(data_p); } - } + free(namelist[ix]); } free(namelist); - } - // Include candidate processes from the explicit pid list, if not already in hash table + } // scan_all_processes + // Process explicit inclusion and exclusion pid lists pthread_mutex_lock(&pid_list_mutex); - pid_list_node_p pid_ptr = pid_list; - while (pid_ptr != NULL) { + // Include candidate processes from the explicit include pid list + pid_list_p pid_ptr = include_pid_list; + while ((pid_ptr != NULL) && (new_candidates < process_hash_table_size / 3)) { int hash_ix = process_hash_lookup(pid_ptr->pid); - if ( (hash_ix >= 0) && (process_hash_table[hash_ix].uptime > this_update_time)) { + if ( (hash_ix >= 0) && (process_hash_table[hash_ix].data_time_stamp > this_update_time)) { + // Already in hash table, and recently updated... pid_ptr = pid_ptr->next; - continue; // already in hash table / short list, and recently updated + continue; } - snprintf(fname, FNAME_SIZE, "/proc/%d/stat", pid_ptr->pid); - if ((sdata_p = get_stat_data(fname)) != NULL) { - files += 1; - process_data_t pdata; - pdata.uptime = get_uptime(NULL); // get_uptime() soon after get_stat_data() - pdata.pid = sdata_p->pid; - // pdata.pgrp = sdata_p->pgrp; - pdata.comm = sdata_p->comm; - pdata.utime = sdata_p->utime; - pdata.stime = sdata_p->stime; - // pdata.priority = sdata_p->priority; - // pdata.nice = sdata_p->nice; - pdata.num_threads = sdata_p->num_threads; - // pdata.starttime = sdata_p->starttime; - // pdata.vsize_MBs = sdata_p->vsize / MEGABYTE; - pdata.rss_MBs = (sdata_p->rss * page_size_in_bytes) / MEGABYTE; - pdata.processor = sdata_p->processor; - // pdata.rt_priority = sdata_p->rt_priority; - // pdata.policy = sdata_p->policy; - // pdata.guest_time = sdata_p->guest_time; - process_hash_update(&pdata); + process_data_p data_p; + if ((data_p = get_stat_data_for_pid(pid_ptr->pid, NULL)) != NULL) { + data_p->data_time_stamp = get_time_stamp(); + new_candidates += process_hash_update(data_p); + if (!scan_all_processes) { + files += 1; + } pid_ptr = pid_ptr->next; } else { // no stat file so assume pid dead -- remove it from pid list - remove_pid_from_pid_list(pid_ptr->pid); - pid_ptr = pid_list; // just restart from list beginning + include_pid_list = remove_pid_from_pid_list(include_pid_list, pid_ptr->pid); + pid_ptr = include_pid_list; // just restart from list beginning continue; } } + // Zero CPU utilization for processes on the explicit exclude pid list + pid_ptr = exclude_pid_list; + while (pid_ptr != NULL) { + int hash_ix = process_hash_lookup(pid_ptr->pid); + if (hash_ix >= 0) { + process_hash_table[hash_ix].CPUs_used = 0; + } + pid_ptr = pid_ptr->next; + } pthread_mutex_unlock(&pid_list_mutex); if (log_level >= LOG_INFO) { numad_log(LOG_INFO, "Processes: %d\n", files); } - // Prune out-of-date processes - for (int ix = 0; (ix < PROCESS_HASH_TABLE_SIZE); ix++) { - process_data_p p = &process_hash_table[ix]; - if (p->pid) { - if (p->uptime < this_update_time) { - p->CPUs_used = 0; // Zero old CPU utilization - // Check for dead pids and remove them... - char fname[FNAME_SIZE]; - snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid); - if (access(fname, F_OK) < 0) { - // Seems dead. Forget this pid -- after first checking - // and removing obsolete numad.PID cpuset directories. 
-                    snprintf(fname, FNAME_SIZE, "%s/numad.%d", cpuset_dir, p->pid);
-                    if (access(fname, F_OK) == 0) {
-                        numad_log(LOG_NOTICE, "Removing obsolete cpuset: %s\n", fname);
-                        int rc = rmdir(fname);
-                        if (rc == -1) {
-                            numad_log(LOG_ERR, "bad cpuset rmdir\n");
-                            // exit(EXIT_FAILURE);
-                        }
-                    }
-                    process_hash_remove(p->pid);
-                }
-            }
-        }
-    }
+    // Now go through all managed processes to cleanup out-of-date and dead ones.
+    process_hash_table_cleanup(this_update_time);
     return files;
 }
-
-
-
 id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
-    int pid_ix;
-    process_data_p p = NULL;
-    static id_list_p existing_mems_list_p;
-    CLEAR_LIST(existing_mems_list_p);
-    int num_existing_mems = 0;
-    uint64_t process_MBs[MAX_NODES];
-    memset(process_MBs, 0, sizeof(process_MBs));
-#if (NEED_PROCESS_CPUS)
-    uint64_t process_CPUs[MAX_NODES];
-    memset(process_CPUs, 0, sizeof(process_CPUs));
-#endif
+    char buf[BUF_SIZE];
+    char buf2[BUF_SIZE];
     if (log_level >= LOG_DEBUG) {
         numad_log(LOG_DEBUG, "PICK NODES FOR: PID: %d, CPUs %d, MBs %d\n", pid, cpus, mbs);
     }
+    int num_existing_mems = 0;
+    static id_list_p existing_mems_list_p;
+    CLEAR_LIST(existing_mems_list_p);
+    static node_data_p tmp_node;
+    static uint64_t *process_MBs;
+    static uint64_t *saved_magnitude_for_node;
+    static int process_MBs_num_nodes;
+    uint64_t time_stamp = get_time_stamp();
     // For existing processes, get miscellaneous process specific details
+    int pid_ix;
+    process_data_p p = NULL;
     if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
         p = &process_hash_table[pid_ix];
-        char buf[BUF_SIZE];
+        // Quick rejection if this process has interleaved memory, but recheck it once an hour...
+#define MIN_DELAY_FOR_INTERLEAVE (3600 * ONE_HUNDRED)
+        if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0)
+            && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) {
+            if (log_level >= LOG_DEBUG) {
+                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
+            }
+            return NULL;
+        }
+        // Get cpuset name for this process, and existing mems binding, if any.
         char fname[FNAME_SIZE];
-
-        // First get cpuset name for this process, and existing mems binding, if any.
         snprintf(fname, FNAME_SIZE, "/proc/%d/cpuset", pid);
         FILE *fs = fopen(fname, "r");
         if (!fs) {
@@ -1518,35 +1501,51 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
         if (log_level >= LOG_DEBUG) {
             numad_log(LOG_DEBUG, "CPUSET_NAME: %s\n", p->cpuset_name);
         }
-
         snprintf(fname, FNAME_SIZE, "%s%s/cpuset.mems", cpuset_dir, p->cpuset_name);
         fs = fopen(fname, "r");
         if ((fs) && (fgets(buf, BUF_SIZE, fs))) {
             fclose(fs);
             num_existing_mems = add_ids_to_list_from_str(existing_mems_list_p, buf);
             if (log_level >= LOG_DEBUG) {
-                // FIXME: just print buf w/o exercising conversion routines
                 str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
-                numad_log(LOG_DEBUG, "EXISTING NODE LIST: %s\n", buf);
+                numad_log(LOG_DEBUG, "EXISTING CPUSET NODE LIST: %s\n", buf);
             }
         }
-
-        // If we have bound this process to the same nodes multiple times
-        // already, and the load on those nodes seems acceptable, skip the rest
-        // of this and just return NULL to indicate no change needed. FIXME:
-        // figure out what else can change that should cause a rebinding (e.g.
-        // (1) some process gets sub-optimal allocation on busy machine which
+        // If this process was just recently bound, enforce a minimum delay
+        // period between repeated attempts to potentially move the memory.
+        // FIXME: ?? might this retard appropriate process expansion too much?
+#define MIN_DELAY_FOR_REEVALUATION (30 * ONE_HUNDRED)
+        if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) {
+            // Skip re-evaluation because we just did it recently.
+            if (log_level >= LOG_DEBUG) {
+                numad_log(LOG_DEBUG, "Skipping evaluation because done too recently.\n");
+            }
+            return NULL;
+        }
+        // Look for short cut because of duplicate bindings. If we have bound
+        // this process to the same nodes multiple times already, and the load
+        // on those nodes still seems acceptable, skip the rest of this and
+        // just return NULL to indicate no change needed. FIXME: should figure
+        // out what can change that would make a rebinding desirable (e.g. (1)
+        // some process gets sub-optimal allocation on busy machine which
         // subsequently becomes less busy leaving disadvantaged process. (2)
         // node load imbalance, (3) any process split across nodes which should
-        // fit within a single node.)
-        if (p->dup_bind_count > 1) {
+        // fit within a single node.) For now, just expire the dup_bind_count
+        // occasionally, which is a reasonably good mitigation.
+        // So, check to see if we should decay the dup_bind_count...
+#define DUP_BIND_TIME_OUT (300 * ONE_HUNDRED)
+        if ((p->dup_bind_count > 0) && (p->bind_time_stamp + DUP_BIND_TIME_OUT < time_stamp)) {
+            p->dup_bind_count -= 1;
+        }
+        // Now, look for short cut because of duplicate bindings
+        if (p->dup_bind_count > 0) {
             int node_id = 0;
             int nodes_have_cpu = 1;
             int nodes_have_ram = 1;
             int n = num_existing_mems;
             int min_resource_pct = 100 - target_utilization;
-            if (min_resource_pct < 10) {
-                min_resource_pct = 10;
+            if (min_resource_pct < 5) {
+                min_resource_pct = 5;
             }
             while (n) {
                 if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
@@ -1566,96 +1565,88 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
                 numad_log(LOG_DEBUG, "Evaluated for skipping by repeat binding, but CPUS: %d, RAM: %d\n", nodes_have_cpu, nodes_have_ram);
             }
         }
-
-// FIXME: this scanning is expensive and must be minimized. Not only is it
-// expensive, but older kernels dismantle transparent huge pages while
-// producing the numa memory map information!! This THP issue is fixed in new
-// kernels -- and hopefully included in RHEL6.3 -- but the scanning will always
-// remain expensive and something that should be minimized...
-
-        // Second, add up per-node memory in use by this process
+        // Fourth, add up per-node memory in use by this process. This scanning
+        // is expensive and should be minimized. Also, old kernels dismantle
+        // transparent huge pages while producing the numa_maps memory
+        // information!
+        // Check to see if dynamic structures need to grow.
+        if (process_MBs_num_nodes < num_nodes + 1) {
+            process_MBs_num_nodes = num_nodes + 1;
+            // The "+1 node" is for accumulating interleaved memory
+            process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t));
+            tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) );
+            saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t));
+            if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) {
+                numad_log(LOG_CRIT, "process_MBs realloc failed\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+        memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t));
         snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
         fs = fopen(fname, "r");
         if (!fs) {
             numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
-            return NULL; // Assume the process terminated?
+            return NULL; // Assume the process terminated
         }
+        int process_has_interleaved_memory = 0;
         while (fgets(buf, BUF_SIZE, fs)) {
+            int interleaved_memory = 0;
             uint64_t page_size = page_size_in_bytes;
-            const char *delimiters = " \t\r\n";
+            const char *delimiters = " \n";
             char *p = strtok(buf, delimiters);
             while (p) {
-                if (!strcmp(p, "huge")) {
+                if (!strncmp(p, "interleave", 10)) {
+                    interleaved_memory = 1;
+                    process_has_interleaved_memory = 1;
+                } else if (!strcmp(p, "huge")) {
                     page_size = huge_page_size_in_bytes;
-                } else if (p[0] == 'N') {
-                    int node = (int)strtol(&p[1], &p, 10);
-                    if (p[0] != '=') {
+                } else if (*p++ == 'N') {
+                    int node;
+                    uint64_t pages;
+                    CONVERT_DIGITS_TO_NUM(p, node);
+                    if (*p++ != '=') {
                         numad_log(LOG_CRIT, "numa_maps node number parse error\n");
                         exit(EXIT_FAILURE);
                     }
-                    uint64_t pages = strtol(&p[1], &p, 10);
+                    CONVERT_DIGITS_TO_NUM(p, pages);
                     process_MBs[node] += (pages * page_size);
+                    if (interleaved_memory) {
+                        // sum interleaved quantity in "extra node"
+                        process_MBs[num_nodes] += (pages * page_size);
+                    }
                 }
                 // Get next token on the line
                 p = strtok(NULL, delimiters);
             }
         }
-        for (int ix = 0; (ix < num_nodes); ix++) {
+        fclose(fs);
+        for (int ix = 0; (ix <= num_nodes); ix++) {
             process_MBs[ix] /= MEGABYTE;
             if (log_level >= LOG_DEBUG) {
                 numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]);
             }
         }
-        fclose(fs);
-
-#if (NEED_PROCESS_CPUS)
-
-        // Third, add up per-node CPUs recently used by this process
-        snprintf(fname, FNAME_SIZE, "/proc/%d/task", pid);
-        struct dirent **namelist;
-        int files = scandir(fname, &namelist, name_starts_with_digit, NULL);
-        if (files < 0) {
-            numad_log(LOG_WARNING, "Tried to research PID %d tasks, but it apparently went away.\n", p->pid);
-            return NULL; // Assume the process terminated?
-        }
-        for (int ix = 0; (ix < files); ix++) {
-            snprintf(fname, FNAME_SIZE, "/proc/%d/task/%s/stat", pid, namelist[ix]->d_name);
-            stat_data_p sdata_p = get_stat_data(fname);
-            if (sdata_p != NULL) {
-                process_CPUs[node_from_cpu[sdata_p->processor]] += 1;
-            }
-        }
-        for (int ix = 0; (ix < num_nodes); ix++) {
-            // Assume average load per thread
-            process_CPUs[ix] *= (p->CPUs_used / p->num_threads);
+        if (process_has_interleaved_memory) {
+            // Mark this process as having interleaved memory, and stamp it as done.
+            p->flags |= PROCESS_FLAG_INTERLEAVED;
+            p->bind_time_stamp = get_time_stamp();
             if (log_level >= LOG_DEBUG) {
-                numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, process_CPUs[ix]);
+                numad_log(LOG_DEBUG, "Skipping evaluation because of interleaved memory.\n");
             }
+            return NULL;
         }
-
-#endif
-
     } // end of existing PID conditional
-
-
-
-    // Make a copy of node available resources array
-    static node_data_p tmp_node; // FIXME: what if num_nodes changes after allocate
-    if (tmp_node == NULL) {
-        tmp_node = malloc(num_nodes * sizeof(node_data_t) );
-        if (tmp_node == NULL) {
-            numad_log(LOG_CRIT, "malloc failed\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
-
-    // Add in info specific to this process to equalize available resource
-    // quantities wrt locations of resources already in use by this process.
-    // Average the amount of CPUs_free across the existing nodes used, because
-    // the threads are free to move around in that domain. Inflate the value
-    // of already assigned memory by 3/2, because moving memory is expensive.
+    // Make a copy of node available resources array. Add in info specific to
+    // this process to equalize available resource quantities wrt locations of
+    // resources already in use by this process. Inflate the value of already
+    // assigned memory by approximately 3/2, because moving memory is
+    // expensive. Average the amount of CPUs_free across the existing nodes
+    // used, because the threads are free to move around in that domain. After
+    // calculating combined magnitude of available resources, bias the values
+    // towards existing locations for this process.
+    int target_using_all_nodes = 0;
     uint64_t node_CPUs_free_for_this_process = 0;
+    memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) );
     if (num_existing_mems > 0) {
         node_CPUs_free_for_this_process = cpus; // ?? Correct for utilization target inflation?
         int node_id = 0;
@@ -1671,61 +1662,50 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
         node_CPUs_free_for_this_process /= num_existing_mems;
     }
     for (int ix = 0; (ix < num_nodes); ix++) {
-        tmp_node[ix].MBs_free += ((process_MBs[ix] * 3) / 2);
-
-#if (NEED_PROCESS_CPUS)
-
-        if (EQUAL_LISTS(existing_mems_list_p, all_nodes_list_p)) {
-            // If not already bound, consider only available memory for now (by marking all CPUs free)
-            tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
-        } else {
-            // If already bound, consider existing location of CPUs
-            tmp_node[ix].CPUs_free += process_CPUs[ix];
-        }
-
-#endif
-
+        if (pid > 0) {
+            tmp_node[ix].MBs_free += ((process_MBs[ix] * 12) / 8);
+        }
         if ((num_existing_mems > 0) && (ID_IS_IN_LIST(ix, existing_mems_list_p))) {
             tmp_node[ix].CPUs_free = node_CPUs_free_for_this_process;
-            if (log_level >= LOG_DEBUG) {
-                numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, node_CPUs_free_for_this_process);
-            }
         }
         if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) {
             tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total;
         }
+        if (log_level >= LOG_DEBUG) {
+            numad_log(LOG_DEBUG, "PROCESS_CPUs[%d]: %ld\n", ix, tmp_node[ix].CPUs_free);
+        }
         // Calculate magnitude as product of available CPUs and available MBs
         tmp_node[ix].magnitude = tmp_node[ix].CPUs_free * tmp_node[ix].MBs_free;
         // Bias combined magnitude towards already assigned nodes
         if (ID_IS_IN_LIST(ix, existing_mems_list_p)) {
-            tmp_node[ix].magnitude *= 5;
-            tmp_node[ix].magnitude /= 4;
+            tmp_node[ix].magnitude *= 9;
+            tmp_node[ix].magnitude /= 8;
         }
-    }
-
-
-
-    uint64_t saved_magnitude_for_node[MAX_NODES];
-    for (int ix = 0; (ix < num_nodes); ix++) {
+        // Save the current magnitudes
         saved_magnitude_for_node[ix] = tmp_node[ix].magnitude;
     }
-
+    // OK, figure out where to get resources for this request.
     static id_list_p target_node_list_p;
     CLEAR_LIST(target_node_list_p);
-    int prev_node = -1;
-
-    // Allocate sufficient resources
-    while ((mbs > 0) || (cpus > 20)) {
-
-        // First, sort nodes by magnitude
+    int prev_node_used = -1;
+    // Continue to allocate more resources until requests are met.
+    // OK if not quite all the CPU request is met.
+    // FIXME: ?? Is the following too much CPU flexing?
+    while ((mbs > 0) || (cpus > (tmp_node[0].CPUs_total / 4))) {
+        if (log_level >= LOG_DEBUG) {
+            numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus);
+        }
+        // Sort nodes by magnitude of available resources. Note that
+        // inter-node distances (to the previous node used) are factored into
+        // the sort.
         for (int ij = 0; (ij < num_nodes); ij++) {
             int big_ix = ij;
-            for (int ik = ij; (ik < num_nodes); ik++) {
+            for (int ik = ij + 1; (ik < num_nodes); ik++) {
                 uint64_t ik_dist = 1;
                 uint64_t big_ix_dist = 1;
-                if (prev_node >= 0) {
-                    ik_dist = tmp_node[ik].distance[prev_node];
-                    big_ix_dist = tmp_node[big_ix].distance[prev_node];
+                if (prev_node_used >= 0) {
+                    ik_dist = tmp_node[ik].distance[prev_node_used];
+                    big_ix_dist = tmp_node[big_ix].distance[prev_node_used];
                 }
                 // Scale magnitude comparison by distances to previous node used...
                 if ((tmp_node[big_ix].magnitude / big_ix_dist) < (tmp_node[ik].magnitude / ik_dist)) {
@@ -1739,26 +1719,31 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
                 memcpy((void *)&tmp_node[big_ix], (void *)&tmp, sizeof(node_data_t) );
             }
         }
-
         if (log_level >= LOG_DEBUG) {
             for (int ix = 0; (ix < num_nodes); ix++) {
                 numad_log(LOG_DEBUG, "Sorted magnitude[%d]: %ld\n", tmp_node[ix].node_id, tmp_node[ix].magnitude);
             }
         }
-
-        if (tmp_node[0].node_id == prev_node) {
-            // Not going to make progress... Just use everything
+        if (tmp_node[0].node_id == prev_node_used) {
+            // Hmmm. Looks like the best node for more resources is also the
+            // last one we used. This is not going to make progress... So
+            // just punt and use everything.
             OR_LISTS(target_node_list_p, target_node_list_p, all_nodes_list_p);
+            target_using_all_nodes = 1;
             break;
         }
-        prev_node = tmp_node[0].node_id;
-
+        prev_node_used = tmp_node[0].node_id;
         ADD_ID_TO_LIST(tmp_node[0].node_id, target_node_list_p);
-        if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
-            break; // Apparently must use all resource nodes...
+        if (log_level >= LOG_DEBUG) {
+            str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
+            str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
+            numad_log(LOG_DEBUG, "Existing nodes: %s Target nodes: %s\n", buf, buf2);
+        }
+        if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) {
+            // Apparently we must use all resource nodes...
+            target_using_all_nodes = 1;
+            break;
         }
-
-
 #define MBS_MARGIN 10
         if (tmp_node[0].MBs_free >= (mbs + MBS_MARGIN)) {
             tmp_node[0].MBs_free -= mbs;
@@ -1776,19 +1761,7 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
             tmp_node[0].CPUs_free = CPUS_MARGIN;
         }
         tmp_node[0].magnitude = tmp_node[0].CPUs_free * tmp_node[0].MBs_free;
-
-
-#if 0
-        // FIXME: delete this old hack since distance now incorporated in sort above
-        // adjust all the magnitudes by the distance in very sketchy way
-        for (int ix = 1; (ix < num_nodes); ix++) {
-            tmp_node[ix].magnitude /= tmp_node[0].distance[tmp_node[ix].node_id];
-        }
-#endif
-
-
     }
-
     // If this existing process is already located where we want it, and almost
     // all memory is already moved to those nodes, then return NULL indicating
     // no need to change binding this time.
@@ -1797,50 +1770,52 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
         // memory still on non-target nodes, advise the bind anyway because
         // there are some scenarios when the kernel will not move it all the
         // first time.
-
-        p->dup_bind_count += 1;
-        for (int ix = 0; (ix < num_nodes); ix++) {
-            if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
-                goto try_memory_move_again;
+        if (!target_using_all_nodes) {
+            p->dup_bind_count += 1;
+            for (int ix = 0; (ix < num_nodes); ix++) {
+                if ((process_MBs[ix] > 10) && (!ID_IS_IN_LIST(ix, target_node_list_p))) {
+                    goto try_memory_move_again;
+                }
             }
+            // We will accept these memory locations. Stamp it as done.
+            p->bind_time_stamp = get_time_stamp();
+        }
+        // Skip rebinding either because practically all memory is in the
+        // target nodes, or because we are stuck using all the nodes.
+        if (log_level >= LOG_DEBUG) {
+            numad_log(LOG_DEBUG, "Skipping evaluation because memory is reasonably situated.\n");
         }
         return NULL;
     } else {
+        // Either a non-existing process, or a new binding for an existing process.
         if (p != NULL) {
+            // Must be a new binding for an existing process, so reset dup_bind_count.
             p->dup_bind_count = 0;
         }
     }
-
     // See if this proposed move will make a significant difference.
-    // If not, return null instead of advising move.
+    // If not, return null instead of advising the move.
     uint64_t target_magnitude = 0;
     uint64_t existing_magnitude = 0;
-
-    int node_id = 0;
-    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
     int num_target_nodes = NUM_IDS_IN_LIST(target_node_list_p);
-#if 0
-    // FIXME: causes excessive growth
-    if (num_target_nodes > num_existing_nodes) {
-        goto try_memory_move_again;
-    }
-#endif
+    int num_existing_nodes = NUM_IDS_IN_LIST(existing_mems_list_p);
+    /* FIXME: this expansion seems to cause excessive growth
+     * So calculate the improvement before hastily expanding nodes.
+    if (num_target_nodes > num_existing_nodes) { goto try_memory_move_again; }
+    */
+    int node_id = 0;
     int n = num_existing_nodes + num_target_nodes;
     while (n) {
-        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
-            existing_magnitude += saved_magnitude_for_node[node_id];
-            n -= 1;
-        }
         if (ID_IS_IN_LIST(node_id, target_node_list_p)) {
             target_magnitude += saved_magnitude_for_node[node_id];
             n -= 1;
         }
+        if (ID_IS_IN_LIST(node_id, existing_mems_list_p)) {
+            existing_magnitude += saved_magnitude_for_node[node_id];
+            n -= 1;
+        }
         node_id += 1;
     }
-
-    char buf1[BUF_SIZE];
-    char buf2[BUF_SIZE];
-
     if (existing_magnitude > 0) {
         uint64_t magnitude_change = ((target_magnitude - existing_magnitude) * 100) / existing_magnitude;
         if (magnitude_change < 0) {
@@ -1849,25 +1824,24 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) {
         if (magnitude_change <= IMPROVEMENT_THRESHOLD_PERCENT) {
             // Not significant enough percentage change to do rebind
             if (log_level >= LOG_DEBUG) {
-                str_from_id_list(buf1, BUF_SIZE, existing_mems_list_p);
+                str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
                 str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
                 numad_log(LOG_DEBUG, "Moving pid %d from nodes (%s) to nodes (%s) skipped as insignificant improvement: %ld percent.\n",
-                    pid, buf1, buf2, magnitude_change);
+                    pid, buf, buf2, magnitude_change);
             }
+            // We decided this is almost good enough. Stamp it as done.
+            p->bind_time_stamp = get_time_stamp();
             return NULL;
         }
     }
-
 try_memory_move_again:
-
-    str_from_id_list(buf1, BUF_SIZE, existing_mems_list_p);
+    str_from_id_list(buf, BUF_SIZE, existing_mems_list_p);
     str_from_id_list(buf2, BUF_SIZE, target_node_list_p);
     char *cmd_name = "(unknown)";
     if ((p) && (p->comm)) {
         cmd_name = p->comm;
     }
-    numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf1, buf2);
-
+    numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2);
     return target_node_list_p;
 }
@@ -1889,21 +1863,34 @@ void show_processes(process_data_p *ptr, int nprocs) {
             }
             fclose(fs);
         }
-        fprintf(log_fs, "PID %d: MBs_used %ld, CPUs_used %ld, Magnitude %ld, Threads %ld, Uptime %ld, Name: %s, Nodes: %s\n",
-            p->pid, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, p->num_threads, p->uptime, p->comm, buf);
+        fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n",
+            p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf);
     }
     fprintf(log_fs, "\n");
     fflush(log_fs);
 }
+
 int manage_loads() {
-    // Use temporary index to access and sort process hash table
-    process_data_p pindex[PROCESS_HASH_TABLE_SIZE];
-    memset(pindex, 0, sizeof(pindex));
+    // Use temporary index to access and sort hash table entries
+    static process_data_p *pindex;
+    static int pindex_size;
+    if (pindex_size < process_hash_table_size) {
+        pindex_size = process_hash_table_size;
+        pindex = realloc(pindex, pindex_size * sizeof(process_data_p));
+        if (pindex == NULL) {
+            numad_log(LOG_CRIT, "pindex realloc failed\n");
+            exit(EXIT_FAILURE);
+        }
+        // Quick round trip whenever we resize the hash table.
+        // This is mostly to avoid max_interval wait at start up.
+        return min_interval / 2;
+    }
+    memset(pindex, 0, pindex_size * sizeof(process_data_p));
     // Copy live candidate pointers to the index for sorting, etc
     int nprocs = 0;
-    for (int ix = 0; (ix < PROCESS_HASH_TABLE_SIZE); ix++) {
+    for (int ix = 0; (ix < process_hash_table_size); ix++) {
         process_data_p p = &process_hash_table[ix];
         if (p->pid) {
             pindex[nprocs++] = p;
@@ -1911,10 +1898,11 @@ int manage_loads() {
     }
     // Sort index by amount of CPU used * amount of memory used. Not expecting
     // a long list here. Use a simple sort -- however, sort into bins,
-    // treating values within 10% as aquivalent and random which comes first.
+    // treating values within 10% as equivalent. Within bins, order by
+    // bind_time_stamp so oldest bound will be higher priority to evaluate.
     for (int ij = 0; (ij < nprocs); ij++) {
         int best = ij;
-        for (int ik = ij; (ik < nprocs); ik++) {
+        for (int ik = ij + 1; (ik < nprocs); ik++) {
             uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used);
             uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
             uint64_t min_mag = ik_mag;
@@ -1927,8 +1915,8 @@ int manage_loads() {
                 // difference > 10 percent. Use strict ordering
                 if (ik_mag <= best_mag) continue;
             } else {
-                // difference within 10 percent. Use random ordering
-                if (random() > (RAND_MAX / 2)) continue;
+                // difference within 10 percent. Sort these by bind_time_stamp.
+                if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue;
             }
             best = ik;
         }
@@ -1949,19 +1937,29 @@ int manage_loads() {
         }
         int mb_request = (p->MBs_used * 100) / target_utilization;
         int cpu_request = (p->CPUs_used * 100) / target_utilization;
-        // Don't give a process more CPUs than it has threads
-        // FIXME: For guest VMs, should limit max to VCPU threads (and should
-        // maybe put VM IO threads in root cgroup).
-        int thread_limit = p->num_threads * CPU_SCALE_FACTOR;
+        // Do not give a process more CPUs than it has threads!
+        // FIXME: For guest VMs, should limit max to VCPU threads. Will
+        // need to do something more intelligent with guest IO threads
+        // when eventually considering devices and IRQs.
+        int thread_limit = p->num_threads;
+        // If process looks like a KVM guest, try to limit to number of vCPU threads
+        if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) {
+            int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid);
+            if (thread_limit > kvm_vcpu_threads) {
+                thread_limit = kvm_vcpu_threads;
+            }
+        }
+        thread_limit *= ONE_HUNDRED;
         if (cpu_request > thread_limit) {
             cpu_request = thread_limit;
         }
         pthread_mutex_lock(&node_info_mutex);
         id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request);
-        // FIXME: copy node_list_p to shorten mutex region?
+        // FIXME: ?? copy node_list_p to shorten mutex region?
         if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p->pid, p->cpuset_name, node_list_p, NULL))) {
             // Shorten interval if actively moving processes
             pthread_mutex_unlock(&node_info_mutex);
+            p->bind_time_stamp = get_time_stamp();
             return min_interval;
         }
         pthread_mutex_unlock(&node_info_mutex);
@@ -1972,74 +1970,76 @@ int manage_loads() {
-
 void *set_dynamic_options(void *arg) {
-    int arg_value = *(int *)arg;
+    // int arg_value = *(int *)arg;
+    char buf[BUF_SIZE];
     for (;;) {
-        // Loop here forever waiting for a msg to do something...
-        msg_t msg;
-        recv_msg(&msg);
-        switch (msg.body.cmd) {
-            case 'i': {
-                min_interval = msg.body.arg1;
-                max_interval = msg.body.arg2;
-                if (max_interval <= 0) {
-                    shut_down_numad();
-                }
-                numad_log(LOG_NOTICE, "Changing interval to %d:%d\n", msg.body.arg1, msg.body.arg2);
-                break;
-            }
-            case 'l': {
-                numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
-                log_level = msg.body.arg1;
-                break;
-            }
-            case 'p': {
-                numad_log(LOG_NOTICE, "Adding PID %d to explicit PID list\n", msg.body.arg1);
-                pthread_mutex_lock(&pid_list_mutex);
-                insert_pid_into_pid_list(msg.body.arg1);
-                pthread_mutex_unlock(&pid_list_mutex);
-                break;
-            }
-            case 'r': {
-                numad_log(LOG_NOTICE, "Removing PID %d from explicit PID list\n", msg.body.arg1);
-                pthread_mutex_lock(&pid_list_mutex);
-                remove_pid_from_pid_list(msg.body.arg1);
-                pthread_mutex_unlock(&pid_list_mutex);
-                break;
-            }
-            case 'S': {
-                scan_all_processes = (msg.body.arg1 != 0);
-                if (scan_all_processes) {
-                    numad_log(LOG_NOTICE, "Scanning all processes\n");
-                } else {
-                    numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
-                }
-                break;
-            }
-            case 'u': {
-                numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
-                target_utilization = msg.body.arg1;
-                break;
-            }
-            case 'w': {
-                char buf[BUF_SIZE];
-                numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", msg.body.arg1, msg.body.arg2);
-                pthread_mutex_lock(&node_info_mutex);
-                int nodes = update_nodes();
-                id_list_p node_list_p = pick_numa_nodes(-1, (msg.body.arg1 * CPU_SCALE_FACTOR), msg.body.arg2);
-                str_from_id_list(buf, BUF_SIZE, node_list_p);
-                pthread_mutex_unlock(&node_info_mutex);
-                send_msg(msg.body.src_pid, 'w', requested_cpus, requested_mbs, buf);
-                break;
-            }
-            default: {
-                numad_log(LOG_WARNING, "Unexpected msg command: %c %d %d %s from PID %d\n",
-                    msg.body.cmd, msg.body.arg1, msg.body.arg1, msg.body.text, msg.body.src_pid);
-                break;
-            }
-        }
-    } // for (;;)
+        // Loop here forever waiting for a msg to do something...
+        msg_t msg;
+        recv_msg(&msg);
+        switch (msg.body.cmd) {
+        case 'i':
+            min_interval = msg.body.arg1;
+            max_interval = msg.body.arg2;
+            if (max_interval <= 0) {
+                shut_down_numad();
+            }
+            numad_log(LOG_NOTICE, "Changing interval to %d:%d\n", msg.body.arg1, msg.body.arg2);
+            break;
+        case 'l':
+            numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1);
+            log_level = msg.body.arg1;
+            break;
+        case 'p':
+            numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1);
+            pthread_mutex_lock(&pid_list_mutex);
+            exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1);
+            include_pid_list = insert_pid_into_pid_list(include_pid_list, msg.body.arg1);
+            pthread_mutex_unlock(&pid_list_mutex);
+            break;
+        case 'r':
+            numad_log(LOG_NOTICE, "Removing PID %d from explicit PID lists\n", msg.body.arg1);
+            pthread_mutex_lock(&pid_list_mutex);
+            include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1);
+            exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1);
+            pthread_mutex_unlock(&pid_list_mutex);
+            break;
+        case 'S':
+            scan_all_processes = (msg.body.arg1 != 0);
+            if (scan_all_processes) {
+                numad_log(LOG_NOTICE, "Scanning all processes\n");
+            } else {
+                numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n");
+            }
+            break;
+        case 'u':
+            numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1);
+            target_utilization = msg.body.arg1;
+            break;
+        case 'w':
+            numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n",
+                msg.body.arg1, msg.body.arg2);
+            pthread_mutex_lock(&node_info_mutex);
+            update_nodes();
+            id_list_p node_list_p = pick_numa_nodes(-1, (msg.body.arg1 * ONE_HUNDRED), msg.body.arg2);
+            str_from_id_list(buf, BUF_SIZE, node_list_p);
+            pthread_mutex_unlock(&node_info_mutex);
+            send_msg(msg.body.src_pid, 'w', requested_cpus, requested_mbs, buf);
+            break;
+        case 'x':
+            numad_log(LOG_NOTICE, "Adding PID %d to exclusion PID list\n", msg.body.arg1);
+            pthread_mutex_lock(&pid_list_mutex);
+            include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1);
+            exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, msg.body.arg1);
+            pthread_mutex_unlock(&pid_list_mutex);
+            break;
+        default:
+            numad_log(LOG_WARNING, "Unexpected msg command: %c %d %d %s from PID %d\n",
+                msg.body.cmd, msg.body.arg1, msg.body.arg1, msg.body.text,
+                msg.body.src_pid);
+            break;
+        }
+    } // for (;;)
 }
@@ -2073,10 +2073,9 @@ void parse_two_arg_values(char *p, int *first_ptr, int *second_ptr, int first_is
 }
+
 int main(int argc, char *argv[]) {
     int opt;
-    long list_pid;
-    char *p = NULL;
     int d_flag = 0;
     int i_flag = 0;
     int l_flag = 0;
@@ -2086,22 +2085,70 @@ int main(int argc, char *argv[]) {
     int u_flag = 0;
     int v_flag = 0;
     int w_flag = 0;
-    while ((opt = getopt(argc, argv, "dD:hi:l:p:r:S:u:vVw:")) != -1) {
-        switch (opt) {
-        case 'd': d_flag = 1; log_level = LOG_DEBUG ; break;
-        case 'D': cpuset_dir_list[0] = strdup(optarg); break;
-        case 'h': print_usage_and_exit(argv[0]); break;
-        case 'i': i_flag = 1; parse_two_arg_values(optarg, &min_interval, &max_interval, 1); break;
-        case 'l': l_flag = 1; log_level = atoi(optarg); break;
-        case 'p': p_flag = 1; list_pid = atol(optarg); insert_pid_into_pid_list(list_pid); break;
-        case 'r': r_flag = 1; list_pid = atol(optarg); remove_pid_from_pid_list(list_pid); break;
-        case 'S': S_flag = 1; scan_all_processes = (atoi(optarg) != 0); break;
-        case 'u': u_flag = 1; target_utilization = atoi(optarg); break;
-        case 'v': v_flag = 1; log_level = LOG_INFO; break;
-        case 'V': print_version_and_exit(argv[0]); break;
-        case 'w': w_flag = 1; parse_two_arg_values(optarg, &requested_cpus, &requested_mbs, 0); break;
-        default: print_usage_and_exit(argv[0]); break;
-        }
+    int x_flag = 0;
+    long list_pid = 0;
+    while ((opt = getopt(argc, argv, "dD:hi:l:p:r:S:u:vVw:x:")) != -1) {
+        switch (opt) {
+        case 'd':
+            d_flag = 1;
+            log_level = LOG_DEBUG;
+            break;
+        case 'D':
+            cpuset_dir_list[0] = strdup(optarg);
+            break;
+        case 'h':
+            print_usage_and_exit(argv[0]);
+            break;
+        case 'i':
+            i_flag = 1;
+            parse_two_arg_values(optarg, &min_interval, &max_interval, 1);
+            break;
+        case 'l':
+            l_flag = 1;
+            log_level = atoi(optarg);
+            break;
+        case 'p':
+            p_flag = 1;
+            list_pid = atol(optarg);
+            exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
+            include_pid_list = insert_pid_into_pid_list(include_pid_list, list_pid);
+            break;
+        case 'r':
+            r_flag = 1;
+            list_pid = atol(optarg);
+            // Remove this PID from both explicit pid lists.
+            include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
+            exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid);
+            break;
+        case 'S':
+            S_flag = 1;
+            scan_all_processes = (atoi(optarg) != 0);
+            break;
+        case 'u':
+            u_flag = 1;
+            target_utilization = atoi(optarg);
+            break;
+        case 'v':
+            v_flag = 1;
+            log_level = LOG_INFO;
+            break;
+        case 'V':
+            print_version_and_exit(argv[0]);
+            break;
+        case 'w':
+            w_flag = 1;
+            parse_two_arg_values(optarg, &requested_cpus, &requested_mbs, 0);
+            break;
+        case 'x':
+            x_flag = 1;
+            list_pid = atol(optarg);
+            include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid);
+            exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, list_pid);
+            break;
+        default:
+            print_usage_and_exit(argv[0]);
+            break;
+        }
     }
     if (argc > optind) {
         fprintf(stderr, "Unexpected arg = %s\n", argv[optind]);
@@ -2113,13 +2160,11 @@ int main(int argc, char *argv[]) {
             exit(EXIT_FAILURE);
         }
     }
-
     open_log_file();
     init_msg_queue();
     num_cpus = get_num_cpus();
     page_size_in_bytes = sysconf(_SC_PAGESIZE);
     huge_page_size_in_bytes = get_huge_page_size_in_bytes();
-
     // Figure out if this is the daemon, or a subsequent invocation
     int daemon_pid = get_daemon_pid();
     if (daemon_pid > 0) {
@@ -2149,19 +2194,22 @@ int main(int argc, char *argv[]) {
             recv_msg(&msg);
             fprintf(stdout, "%s\n", msg.body.text);
         }
+        if (x_flag) {
+            send_msg(daemon_pid, 'x', list_pid, 0, "");
+        }
     } else if (w_flag) {
         // Get pre-placement NUMA advice without starting daemon
         char buf[BUF_SIZE];
-        int nodes = update_nodes();
+        update_nodes();
         sleep(2);
-        nodes = update_nodes();
+        update_nodes();
         numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs);
-        id_list_p node_list_p = pick_numa_nodes(-1, (requested_cpus * CPU_SCALE_FACTOR), requested_mbs);
+        id_list_p node_list_p = pick_numa_nodes(-1, (requested_cpus * ONE_HUNDRED), requested_mbs);
         str_from_id_list(buf, BUF_SIZE, node_list_p);
         fprintf(stdout, "%s\n", buf);
         close_log_file();
         exit(EXIT_SUCCESS);
-    } else {
+    } else if (max_interval > 0) {
         // Start the numad daemon...
         check_prereqs(argv[0]);
         // Daemonize self...
@@ -2184,8 +2232,9 @@ int main(int argc, char *argv[]) {
         if (log_fs != stderr) {
             fclose(stderr);
         }
-
-        // spawn thread to handle messages from subsequent invocation requests
+        // Allocate initial process hash table
+        process_hash_table_expand();
+        // Spawn thread to handle messages from subsequent invocation requests
         pthread_mutex_init(&pid_list_mutex, NULL);
         pthread_mutex_init(&node_info_mutex, NULL);
         pthread_attr_t attr;
@@ -2198,7 +2247,6 @@ int main(int argc, char *argv[]) {
             numad_log(LOG_CRIT, "pthread_create failure\n");
             exit(EXIT_FAILURE);
         }
-        // Loop here forwever...
         for (;;) {
             int interval = max_interval;
@@ -2211,14 +2259,14 @@ int main(int argc, char *argv[]) {
             }
             sleep(interval);
         }
-
         if (pthread_attr_destroy(&attr) != 0) {
             numad_log(LOG_WARNING, "pthread_attr_destroy failure\n");
         }
         pthread_mutex_destroy(&pid_list_mutex);
         pthread_mutex_destroy(&node_info_mutex);
+    } else {
+        shut_down_numad();
     }
-
     exit(EXIT_SUCCESS);
 }
diff --git a/numad.conf b/numad.conf
index 12e02cb..ee04006 100644
--- a/numad.conf
+++ b/numad.conf
@@ -1,6 +1,5 @@
 # Config file for numad
-# content TBD
 #
-# Default INTERVAL is 10
-# uncomment the below to change it
-INTERVAL=10
+# Default INTERVAL is 15
+# modify below to change it
+INTERVAL=15
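
Reviewer note, separate from the patch itself: the reworked per-node /proc/<pid>/numa_maps accounting above is easy to exercise in isolation. The standalone C sketch below approximates the token handling ("interleave", "huge", and "N<node>=<pages>") on one hard-coded sample line. It is illustrative only: it substitutes strtol/strtoull for numad's CONVERT_DIGITS_TO_NUM macro, assumes 4 KB base pages and 2 MB huge pages instead of the sizes numad detects at runtime, and uses hypothetical SKETCH_* names to avoid clashing with numad identifiers.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define SKETCH_MAX_NODES 8
    #define SKETCH_MEGABYTE (1024 * 1024)

    int main(void) {
        // One sample line in the style of /proc/<pid>/numa_maps
        char buf[] = "7f2c40000000 interleave:0-1 anon=512 dirty=512 "
                     "N0=256 N1=256 kernelpagesize_kB=4";
        unsigned long long page_size = 4096;          // assumed base page size
        unsigned long long huge_page_size = 2097152;  // assumed 2 MB huge pages
        // Per-node byte totals; the extra last slot accumulates interleaved
        // memory, mirroring the "+1 node" slot the patch adds to process_MBs[].
        unsigned long long node_bytes[SKETCH_MAX_NODES + 1] = {0};
        int interleaved = 0;
        char *p = strtok(buf, " \n");
        while (p) {
            if (!strncmp(p, "interleave", 10)) {
                interleaved = 1;               // mapping uses an interleave policy
            } else if (!strcmp(p, "huge")) {
                page_size = huge_page_size;    // node counts on this line are huge pages
            } else if (p[0] == 'N') {
                char *end;
                int node = (int)strtol(p + 1, &end, 10);
                if (*end == '=' && node >= 0 && node < SKETCH_MAX_NODES) {
                    unsigned long long pages = strtoull(end + 1, NULL, 10);
                    node_bytes[node] += pages * page_size;
                    if (interleaved) {
                        node_bytes[SKETCH_MAX_NODES] += pages * page_size;
                    }
                }
            }
            p = strtok(NULL, " \n");
        }
        for (int ix = 0; ix <= SKETCH_MAX_NODES; ix++) {
            if (node_bytes[ix]) {
                printf("slot %d: %llu MB\n", ix, node_bytes[ix] / SKETCH_MEGABYTE);
            }
        }
        return 0;
    }

The sketch deliberately handles only a single line; in the daemon the same logic runs per line of the map, resets the page size for each line, and flags the whole process as interleaved so it can be skipped for an hour, as the hunks above show.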