From 87b49d558828d34cb246711627955d76bd6cb22a Mon Sep 17 00:00:00 2001 From: Jan Synacek Date: Mon, 3 Dec 2012 15:36:10 +0100 Subject: [PATCH] Update to 20121130 --- numad.8 | 43 +++++--- numad.c | 300 ++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 215 insertions(+), 128 deletions(-) diff --git a/numad.8 b/numad.8 index e497ea7..2c46f55 100644 --- a/numad.8 +++ b/numad.8 @@ -1,7 +1,8 @@ .TH "numad" "8" "1.0.0" "Bill Gray" "Administration" .SH "numad" .LP -numad \- A user\-level daemon that provides advice and managment for optimum use of CPUs and memory on systems with NUMA topology. +numad \- A user\-level daemon that provides placement advice and process +management for efficient use of CPUs and memory on systems with NUMA topology. .SH "SYNTAX" .LP numad [\fI\-dhvV\fP] @@ -13,6 +14,9 @@ numad [\fI\-D non-standard-cgroup-mount-point\fP] numad [\fI\-i [min_interval:]max_interval\fP] .br .LP +numad [\fI\-K 0|1\fP] +.br +.LP numad [\fI\-l log_level\fP] .br .LP @@ -36,11 +40,18 @@ numad [\fI\-x PID\fP] .SH "DESCRIPTION" .LP -Numad is a system daemon that monitors NUMA topology and usage. It will attempt -to locate processes for optimum NUMA locality and affinity, dynamically -adjusting to changing system conditions. Numad also provides guidance to assist -management applications with initial manual binding of CPU and memory resources -for their processes. +Numad is a system daemon that monitors NUMA topology and resource usage. It +will attempt to locate processes for efficient NUMA locality and affinity, +dynamically adjusting to changing system conditions. Numad also provides +guidance to assist management applications with initial manual binding of CPU +and memory resources for their processes. Note that numad is primarily +intended for server consolidation environments, where there might be multiple +applications or multiple virtual guests running on the same server system. +Numad is most likely to have a positive effect when processes can be localized +in a subset of the system's NUMA nodes. If the entire system is dedicated to a +large in-memory database application, for example -- especially if memory +accesses will likely remain unpredictable -- numad will probably not improve +performance. .SH "OPTIONS" .LP .TP @@ -61,6 +72,16 @@ Sets the time interval that numad waits between system scans, in seconds to cause the daemon to exit. (This is the normal mechanism to terminate the daemon.) A bigger <\fImax_interval\fP> will decrease numad overhead but also decrease responsiveness to changing loads. +.TP +\fB\-K\fR <\fI0|1\fP> +This option controls whether numad keeps interleaved memory spread across NUMA +nodes, or attempts to merge interleaved memory to local NUMA nodes. The +default is to merge interleaved memory. This is the appropriate setting to +localize processes in a subset of the system's NUMA nodes. If you are running +a large, single-instance application that allocates interleaved memory because +the workload will have continuous unpredictable memory access patterns (e.g. a +large in-memory database), you might get better results by specifying \fI\-K +1\fP to instruct numad to keep interleaved memory distributed. .TP \fB\-l\fR <\fIlog_level\fP> Sets the log level to <\fIlog_level\fP>. Reasonable choices are 5, 6, or 7. @@ -69,15 +90,15 @@ The default value is 5. \fB\-p\fR <\fIPID\fP> Add PID to explicit inclusion list of processes to consider for managing, if the process also uses significant resources. Multiple \fI\-p PID\fP options -can be specified at daemon start, but after deamon start, only one PID can be +can be specified at daemon start, but after daemon start, only one PID can be added to the inclusion list per subsequent numad invocation. Use with \-S to precisely control the scope of processes numad can manage. Note that the specified process will not necessarily be actively managed unless it also meets -numad's significance threshold -- which is currently 300MB and half a CPU. +numad's significance threshold -- which is currently 300MB and half of a CPU. .TP \fB\-r\fR <\fIPID\fP> Remove PID from both the explicit inclusion and the exclusion lists of -processes. After deamon start, only one PID can be removed from the explicit +processes. After daemon start, only one PID can be removed from the explicit process lists per subsequent numad invocation. Use with \-S and \-p and \-x to precisely control the scope of processes numad can manage. .TP @@ -110,7 +131,7 @@ Queries numad for the best NUMA nodes to bind an entity that needs be specified as well <\fI:MB\fP> so numad can recommend NUMA nodes with available CPU capacity and adequate free memory. This query option can be used regardless of whether numad is running as a daemon. (An invocation using this -option when numad is not running as a daemon, will not cause the deamon to +option when numad is not running as a daemon, will not cause the daemon to start.) Output of this option is a string that contains a NUMA node list. For example: 2\-3,6. The recommended node list could be saved in a shell variable (e.g., NODES) and then used as the node list parameter in a @@ -122,7 +143,7 @@ command. See numactl(8). \fB\-x\fR <\fIPID\fP> Add PID to explicit exclusion list of processes to blacklist from managing. Multiple \fI\-x PID\fP options can be specified at daemon start, but after -deamon start, only one PID can be added to the exclusion list per subsequent +daemon start, only one PID can be added to the exclusion list per subsequent numad invocation. Use with \-S to precisely control the scope of processes numad can manage. .SH "FILES" diff --git a/numad.c b/numad.c index 7857d8c..9262cab 100644 --- a/numad.c +++ b/numad.c @@ -54,7 +54,7 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #include -#define VERSION_STRING "20121015" +#define VERSION_STRING "20121130" #define VAR_RUN_FILE "/var/run/numad.pid" @@ -112,6 +112,7 @@ int min_interval = MIN_INTERVAL; int max_interval = MAX_INTERVAL; int target_utilization = TARGET_UTILIZATION_PERCENT; int scan_all_processes = 1; +int keep_interleaved_memory = 0; pthread_mutex_t pid_list_mutex; pthread_mutex_t node_info_mutex; @@ -197,7 +198,7 @@ void init_msg_queue() { msg_qid = msgget(msg_key, msg_flg); if (msg_qid < 0) { numad_log(LOG_CRIT, "msgget failed\n"); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } flush_msg_queue(); } @@ -205,7 +206,7 @@ void init_msg_queue() { void recv_msg(msg_p m) { if (msgrcv(msg_qid, m, sizeof(msg_body_t), getpid(), 0) < 0) { numad_log(LOG_CRIT, "msgrcv failed\n"); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } // printf("Received: >>%s<< from process %d\n", m->body.text, m->body.src_pid); } @@ -273,8 +274,8 @@ typedef struct id_list { int add_ids_to_list_from_str(id_list_p list_p, char *s) { if (list_p == NULL) { - numad_log(LOG_CRIT, "Cannot add to NULL list\n"); - exit(EXIT_FAILURE); + numad_log(LOG_CRIT, "Cannot add to NULL list\n"); + exit(EXIT_FAILURE); } if ((s == NULL) || (strlen(s) == 0)) { goto return_list; @@ -308,8 +309,8 @@ return_list: int str_from_id_list(char *str_p, int str_size, id_list_p list_p) { char *p = str_p; if ((p == NULL) || (str_size < 3)) { - numad_log(LOG_CRIT, "Bad string for ID listing\n"); - exit(EXIT_FAILURE); + numad_log(LOG_CRIT, "Bad string for ID listing\n"); + exit(EXIT_FAILURE); } int n; if ((list_p == NULL) || ((n = NUM_IDS_IN_LIST(list_p)) == 0)) { @@ -378,7 +379,7 @@ typedef struct process_data { // Hash table size must always be a power of two -#define MIN_PROCESS_HASH_TABLE_SIZE 64 +#define MIN_PROCESS_HASH_TABLE_SIZE 16 int process_hash_table_size = 0; int process_hash_collisions = 0; process_data_p process_hash_table = NULL; @@ -467,6 +468,7 @@ int process_hash_update(process_data_p newp) { return new_hash_table_entry; } + int process_hash_rehash(int old_ix) { // Given the index of a table entry that would otherwise be orphaned by // process_hash_remove(), reinsert into table using PID from existing record. @@ -486,12 +488,16 @@ int process_hash_remove(int pid) { if (ix >= 0) { // remove the target process_data_p dp = &process_hash_table[ix]; - if (dp->comm) { free(dp->comm); } + if (dp->comm) { free(dp->comm); } if (dp->cpuset_name) { free(dp->cpuset_name); } - // if (dp->node_list_p) { FREE_LIST(dp->node_list_p); } memset(dp, 0, sizeof(process_data_t)); - // bubble up the collision chain - while ((pid = process_hash_table[++ix].pid) > 0) { + // bubble up the collision chain and rehash if neeeded + for (;;) { + ix += 1; + ix &= (process_hash_table_size - 1); + if ((pid = process_hash_table[ix].pid) <= 0) { + break; + } if (process_hash_lookup(pid) < 0) { if (process_hash_rehash(ix) < 0) { numad_log(LOG_ERR, "rehash fail\n"); @@ -512,6 +518,7 @@ void process_hash_table_expand() { } else { process_hash_table_size = MIN_PROCESS_HASH_TABLE_SIZE; } + numad_log(LOG_DEBUG, "Expanding hash table size: %d\n", process_hash_table_size); process_hash_table = malloc(process_hash_table_size * sizeof(process_data_t)); if (process_hash_table == NULL) { numad_log(LOG_CRIT, "hash table malloc failed\n"); @@ -531,6 +538,18 @@ void process_hash_table_expand() { } } +void process_hash_table_dump() { + for (int ix = 0; (ix < process_hash_table_size); ix++) { + process_data_p p = &process_hash_table[ix]; + if (p->pid) { + numad_log(LOG_DEBUG, + "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld Data TS: %ld Bind TS: %ld\n", + ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads, + p->CPUs_used, p->MBs_used, p->data_time_stamp, p->bind_time_stamp); + } + } +} + void process_hash_table_cleanup(uint64_t update_time) { int cpusets_removed = 0; int num_hash_entries_used = 0; @@ -591,8 +610,10 @@ pid_list_p insert_pid_into_pid_list(pid_list_p list_ptr, long pid) { if (process_hash_table != NULL) { int hash_ix = process_hash_lookup(pid); if ((hash_ix >= 0) && (list_ptr == include_pid_list)) { - // Clear dup_bind_count, in case user wants it to be re-evaluated soon + // Clear dup_bind_count and interleaved flag, + // in case user wants it to be re-evaluated soon process_hash_table[hash_ix].dup_bind_count = 0; + process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED; } } // Check for duplicate pid first @@ -661,6 +682,8 @@ void print_usage_and_exit(char *prog_name) { fprintf(stderr, "-D to specify cgroup mount point\n"); fprintf(stderr, "-h to print this usage info\n"); fprintf(stderr, "-i [:] to specify interval seconds\n"); + fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes\n"); + fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes\n"); fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7)\n"); fprintf(stderr, "-p to add PID to inclusion pid list\n"); fprintf(stderr, "-r to remove PID from explicit pid lists\n"); @@ -724,7 +747,7 @@ void check_prereqs(char *prog_name) { fprintf(stderr, "Looks like transparent hugepage scan time in %s is %d ms.\n", thp_scan_fname, ms); fprintf(stderr, "Consider increasing the frequency of THP scanning,\n"); fprintf(stderr, "by echoing a smaller number (e.g. 100) to %s\n", thp_scan_fname); - fprintf(stderr, "to more agressively (re)construct THPs. For example:\n"); + fprintf(stderr, "to more aggressively (re)construct THPs. For example:\n"); fprintf(stderr, "# echo 100 > /sys/kernel/mm/redhat_transparent_hugepage/khugepaged/scan_sleep_millisecs\n"); fprintf(stderr, "\n"); } @@ -857,19 +880,19 @@ int get_huge_page_size_in_bytes() { int huge_page_size = 0;; FILE *fs = fopen("/proc/meminfo", "r"); if (!fs) { - numad_log(LOG_CRIT, "Can't open /proc/meminfo\n"); - exit(EXIT_FAILURE); + numad_log(LOG_CRIT, "Can't open /proc/meminfo\n"); + exit(EXIT_FAILURE); } char buf[BUF_SIZE]; while (fgets(buf, BUF_SIZE, fs)) { - if (!strncmp("Hugepagesize", buf, 12)) { - char *p = &buf[12]; - while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) { - p++; - } - huge_page_size = atoi(p); - break; - } + if (!strncmp("Hugepagesize", buf, 12)) { + char *p = &buf[12]; + while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) { + p++; + } + huge_page_size = atoi(p); + break; + } } fclose(fs); return huge_page_size * KILOBYTE; @@ -1099,8 +1122,8 @@ int node_and_digits(const struct dirent *dptr) { if (*p++ != 'd') return 0; if (*p++ != 'e') return 0; do { - if (!isdigit(*p++)) - return 0; + if (!isdigit(*p++)) + return 0; } while (*p != '\0'); return 1; } @@ -1458,11 +1481,23 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { int num_existing_mems = 0; static id_list_p existing_mems_list_p; CLEAR_LIST(existing_mems_list_p); + uint64_t time_stamp = get_time_stamp(); static node_data_p tmp_node; static uint64_t *process_MBs; static uint64_t *saved_magnitude_for_node; static int process_MBs_num_nodes; - uint64_t time_stamp = get_time_stamp(); + // See if dynamic structures need to grow. + if (process_MBs_num_nodes < num_nodes + 1) { + process_MBs_num_nodes = num_nodes + 1; + // The "+1 node" is for accumulating interleaved memory + process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t)); + tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) ); + saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t)); + if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) { + numad_log(LOG_CRIT, "process_MBs realloc failed\n"); + exit(EXIT_FAILURE); + } + } // For existing processes, get miscellaneous process specific details int pid_ix; process_data_p p = NULL; @@ -1487,7 +1522,7 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { } if (!fgets(buf, BUF_SIZE, fs)) { numad_log(LOG_WARNING, "Tried to research PID %d cpuset, but it apparently went away.\n", p->pid); - fclose(fs); + fclose(fs); return NULL; // Assume the process terminated? } fclose(fs); @@ -1569,18 +1604,6 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { // is expensive and should be minimized. Also, old kernels dismantle // transparent huge pages while producing the numa_maps memory // information! - // Check to see if dynamic structures need to grow. - if (process_MBs_num_nodes < num_nodes + 1) { - process_MBs_num_nodes = num_nodes + 1; - // The "+1 node" is for accumulating interleaved memory - process_MBs = realloc(process_MBs, process_MBs_num_nodes * sizeof(uint64_t)); - tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) ); - saved_magnitude_for_node = realloc(saved_magnitude_for_node, num_nodes * sizeof(uint64_t)); - if ((process_MBs == NULL) || (tmp_node == NULL) || (saved_magnitude_for_node == NULL)) { - numad_log(LOG_CRIT, "process_MBs realloc failed\n"); - exit(EXIT_FAILURE); - } - } memset(process_MBs, 0, process_MBs_num_nodes * sizeof(uint64_t)); snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid); fs = fopen(fname, "r"); @@ -1626,8 +1649,9 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, process_MBs[ix]); } } - if (process_has_interleaved_memory) { - // Mark this process as having interleaved memory, and stamp it as done. + if ((process_has_interleaved_memory) && (keep_interleaved_memory)) { + // Mark this process as having interleaved memory so we do not + // merge the interleaved memory. Time stamp it as done. p->flags |= PROCESS_FLAG_INTERLEAVED; p->bind_time_stamp = get_time_stamp(); if (log_level >= LOG_DEBUG) { @@ -1690,8 +1714,15 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { int prev_node_used = -1; // Continue to allocate more resources until request are met. // OK if not not quite all the CPU request is met. - // FIXME: ?? Is the following too much CPU flexing? - while ((mbs > 0) || (cpus > (tmp_node[0].CPUs_total / 4))) { + // FIXME: ?? Is half of the utilization margin a good amount of CPU flexing? + int cpu_flex = ((100 - target_utilization) * tmp_node[0].CPUs_total) / 200; + if (pid <= 0) { + // If trying to find resources for pre-placement advice request, do not + // underestimate the amount of CPUs needed. Instead, err on the side + // of providing too many resources. So, no flexing here... + cpu_flex = 0; + } + while ((mbs > 0) || (cpus > cpu_flex)) { if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus); } @@ -1834,6 +1865,10 @@ id_list_p pick_numa_nodes(int pid, int cpus, int mbs) { return NULL; } } + if ((pid <= 0) && (num_target_nodes <= 0)) { + // Always provide at least one node for pre-placement advice + ADD_ID_TO_LIST(node[0].node_id, target_node_list_p); + } try_memory_move_again: str_from_id_list(buf, BUF_SIZE, existing_mems_list_p); str_from_id_list(buf2, BUF_SIZE, target_node_list_p); @@ -1974,89 +2009,112 @@ void *set_dynamic_options(void *arg) { // int arg_value = *(int *)arg; char buf[BUF_SIZE]; for (;;) { - // Loop here forever waiting for a msg to do something... - msg_t msg; - recv_msg(&msg); - switch (msg.body.cmd) { - case 'i': - min_interval = msg.body.arg1; - max_interval = msg.body.arg2; - if (max_interval <= 0) { - shut_down_numad(); - } - numad_log(LOG_NOTICE, "Changing interval to %d:%d\n", msg.body.arg1, msg.body.arg2); - break; - case 'l': - numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1); - log_level = msg.body.arg1; - break; - case 'p': - numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1); - pthread_mutex_lock(&pid_list_mutex); - exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1); - include_pid_list = insert_pid_into_pid_list(include_pid_list, msg.body.arg1); - pthread_mutex_unlock(&pid_list_mutex); - break; - case 'r': - numad_log(LOG_NOTICE, "Removing PID %d from explicit PID lists\n", msg.body.arg1); - pthread_mutex_lock(&pid_list_mutex); - include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1); - exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1); - pthread_mutex_unlock(&pid_list_mutex); - break; - case 'S': - scan_all_processes = (msg.body.arg1 != 0); - if (scan_all_processes) { - numad_log(LOG_NOTICE, "Scanning all processes\n"); - } else { - numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n"); - } - break; - case 'u': - numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1); - target_utilization = msg.body.arg1; - break; - case 'w': - numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", + // Loop here forever waiting for a msg to do something... + msg_t msg; + recv_msg(&msg); + switch (msg.body.cmd) { + case 'i': + min_interval = msg.body.arg1; + max_interval = msg.body.arg2; + if (max_interval <= 0) { + shut_down_numad(); + } + numad_log(LOG_NOTICE, "Changing interval to %d:%d\n", msg.body.arg1, msg.body.arg2); + break; + case 'K': + keep_interleaved_memory = (msg.body.arg1 != 0); + if (keep_interleaved_memory) { + numad_log(LOG_NOTICE, "Keeping interleaved memory spread across nodes\n"); + } else { + numad_log(LOG_NOTICE, "Merging interleaved memory to localized NUMA nodes\n"); + } + break; + case 'l': + numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1); + log_level = msg.body.arg1; + break; + case 'p': + numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1); + pthread_mutex_lock(&pid_list_mutex); + exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1); + include_pid_list = insert_pid_into_pid_list(include_pid_list, msg.body.arg1); + pthread_mutex_unlock(&pid_list_mutex); + break; + case 'r': + numad_log(LOG_NOTICE, "Removing PID %d from explicit PID lists\n", msg.body.arg1); + pthread_mutex_lock(&pid_list_mutex); + include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1); + exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1); + pthread_mutex_unlock(&pid_list_mutex); + break; + case 'S': + scan_all_processes = (msg.body.arg1 != 0); + if (scan_all_processes) { + numad_log(LOG_NOTICE, "Scanning all processes\n"); + } else { + numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n"); + } + break; + case 'u': + numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1); + target_utilization = msg.body.arg1; + break; + case 'w': + numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", msg.body.arg1, msg.body.arg2); - pthread_mutex_lock(&node_info_mutex); - update_nodes(); - id_list_p node_list_p = pick_numa_nodes(-1, (msg.body.arg1 * ONE_HUNDRED), msg.body.arg2); - str_from_id_list(buf, BUF_SIZE, node_list_p); - pthread_mutex_unlock(&node_info_mutex); - send_msg(msg.body.src_pid, 'w', requested_cpus, requested_mbs, buf); - break; - case 'x': - numad_log(LOG_NOTICE, "Adding PID %d to exclusion PID list\n", msg.body.arg1); - pthread_mutex_lock(&pid_list_mutex); - include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1); - exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, msg.body.arg1); - pthread_mutex_unlock(&pid_list_mutex); - break; - default: - numad_log(LOG_WARNING, "Unexpected msg command: %c %d %d %s from PID %d\n", + pthread_mutex_lock(&node_info_mutex); + update_nodes(); + id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2); + str_from_id_list(buf, BUF_SIZE, node_list_p); + pthread_mutex_unlock(&node_info_mutex); + send_msg(msg.body.src_pid, 'w', 0, 0, buf); + break; + case 'x': + numad_log(LOG_NOTICE, "Adding PID %d to exclusion PID list\n", msg.body.arg1); + pthread_mutex_lock(&pid_list_mutex); + include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1); + exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, msg.body.arg1); + pthread_mutex_unlock(&pid_list_mutex); + break; + default: + numad_log(LOG_WARNING, "Unexpected msg command: %c %d %d %s from PID %d\n", msg.body.cmd, msg.body.arg1, msg.body.arg1, msg.body.text, msg.body.src_pid); - break; - } + break; + } } // for (;;) } -void parse_two_arg_values(char *p, int *first_ptr, int *second_ptr, int first_is_optional) { +void parse_two_arg_values(char *p, int *first_ptr, int *second_ptr, int first_is_optional, int first_scale_digits) { char *orig_p = p; char *q = NULL; int second = -1; - int first = (int)strtol(p, &p, 10); - if (p == orig_p) { + errno = 0; + int first = (int) strtol(p, &p, 10); + if ((errno != 0) || (p == orig_p) || (first < 0)) { fprintf(stderr, "Can't parse arg value(s): %s\n", orig_p); exit(EXIT_FAILURE); } + if (*p == '.') { + p++; + while ((first_scale_digits > 0) && (isdigit(*p))) { + first *= 10; + first += (*p++ - '0'); + first_scale_digits -= 1; + } + while (isdigit(*p)) { p++; } + } + while (first_scale_digits > 0) { + first *= 10; + first_scale_digits -= 1; + } if (*p == ':') { q = p + 1; - second = (int)strtol(q, &p, 10); - if (p == q) { + errno = 0; + second = (int) strtol(q, &p, 10); + if ((errno != 0) || (p == q) || (second < 0)) { fprintf(stderr, "Can't parse arg value(s): %s\n", orig_p); exit(EXIT_FAILURE); } @@ -2078,6 +2136,7 @@ int main(int argc, char *argv[]) { int opt; int d_flag = 0; int i_flag = 0; + int K_flag = 0; int l_flag = 0; int p_flag = 0; int r_flag = 0; @@ -2087,7 +2146,7 @@ int main(int argc, char *argv[]) { int w_flag = 0; int x_flag = 0; long list_pid = 0; - while ((opt = getopt(argc, argv, "dD:hi:l:p:r:S:u:vVw:x:")) != -1) { + while ((opt = getopt(argc, argv, "dD:hi:K:l:p:r:S:u:vVw:x:")) != -1) { switch (opt) { case 'd': d_flag = 1; @@ -2101,7 +2160,11 @@ int main(int argc, char *argv[]) { break; case 'i': i_flag = 1; - parse_two_arg_values(optarg, &min_interval, &max_interval, 1); + parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0); + break; + case 'K': + K_flag = 1; + keep_interleaved_memory = (atoi(optarg) != 0); break; case 'l': l_flag = 1; @@ -2137,7 +2200,7 @@ int main(int argc, char *argv[]) { break; case 'w': w_flag = 1; - parse_two_arg_values(optarg, &requested_cpus, &requested_mbs, 0); + parse_two_arg_values(optarg, &requested_cpus, &requested_mbs, 0, 2); break; case 'x': x_flag = 1; @@ -2151,8 +2214,8 @@ int main(int argc, char *argv[]) { } } if (argc > optind) { - fprintf(stderr, "Unexpected arg = %s\n", argv[optind]); - exit(EXIT_FAILURE); + fprintf(stderr, "Unexpected arg = %s\n", argv[optind]); + exit(EXIT_FAILURE); } if (i_flag) { if ((max_interval < min_interval) && (max_interval != 0)) { @@ -2174,6 +2237,9 @@ int main(int argc, char *argv[]) { if (i_flag) { send_msg(daemon_pid, 'i', min_interval, max_interval, ""); } + if (K_flag) { + send_msg(daemon_pid, 'K', keep_interleaved_memory, 0, ""); + } if (d_flag || l_flag || v_flag) { send_msg(daemon_pid, 'l', log_level, 0, ""); } @@ -2204,7 +2270,7 @@ int main(int argc, char *argv[]) { sleep(2); update_nodes(); numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs); - id_list_p node_list_p = pick_numa_nodes(-1, (requested_cpus * ONE_HUNDRED), requested_mbs); + id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs); str_from_id_list(buf, BUF_SIZE, node_list_p); fprintf(stdout, "%s\n", buf); close_log_file();