WIP/MEDIUM: cfgparse: remove numa & thread-count detection

Not needed anymore since already done before landing here.

NOTE that the cmp_hw_cpus() function is here!
Willy Tarreau 2023-07-20 17:22:35 +02:00
parent 404ff1c9d0
commit ec8eb37361


@@ -2475,413 +2475,6 @@ err:
return err_code;
}
#if defined(USE_THREAD) && defined USE_CPU_AFFINITY
#if defined(__linux__)
/* filter directory names matching the pattern node<X> */
static int numa_filter(const struct dirent *dir)
{
char *endptr;
/* dir name must start with "node" prefix */
if (strncmp(dir->d_name, "node", 4))
return 0;
/* dir name must be at least 5 characters long */
if (!dir->d_name[4])
return 0;
/* dir name must end with a numeric id */
if (strtol(&dir->d_name[4], &endptr, 10) < 0 || *endptr)
return 0;
/* all tests succeeded */
return 1;
}
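/* Illustration (not part of the original patch): with this filter,
 * scandir() on NUMA_DETECT_SYSTEM_SYSFS_PATH"/node" (typically
 * /sys/devices/system/node) only returns entries such as "node0" or
 * "node12"; names like "node", "nodex" or "possible" are rejected.
 */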
/* function used by qsort to compare two hwcpus and arrange them by vicinity.
* -1 says a<b, 1 says a>b.
*/
static int cmp_hw_cpus(const void *a, const void *b)
{
const struct ha_cpu_topo *l = (const struct ha_cpu_topo *)a;
const struct ha_cpu_topo *r = (const struct ha_cpu_topo *)b;
/* first, online vs offline */
if (!(l->st & (HA_CPU_F_OFFLINE | HA_CPU_F_EXCLUDED)) && (r->st & (HA_CPU_F_OFFLINE | HA_CPU_F_EXCLUDED)))
return -1;
if (!(r->st & (HA_CPU_F_OFFLINE | HA_CPU_F_EXCLUDED)) && (l->st & (HA_CPU_F_OFFLINE | HA_CPU_F_EXCLUDED)))
return 1;
/* next, package ID */
if (l->pk_id >= 0 && l->pk_id < r->pk_id)
return -1;
if (l->pk_id > r->pk_id && r->pk_id >= 0)
return 1;
/* next, node ID */
if (l->no_id >= 0 && l->no_id < r->no_id)
return -1;
if (l->no_id > r->no_id && r->no_id >= 0)
return 1;
/* next, L3 */
if (l->l3_id >= 0 && l->l3_id < r->l3_id)
return -1;
if (l->l3_id > r->l3_id && r->l3_id >= 0)
return 1;
/* next, cluster */
if (l->cl_id >= 0 && l->cl_id < r->cl_id)
return -1;
if (l->cl_id > r->cl_id && r->cl_id >= 0)
return 1;
/* next, L2 */
if (l->l2_id >= 0 && l->l2_id < r->l2_id)
return -1;
if (l->l2_id > r->l2_id && r->l2_id >= 0)
return 1;
/* next, thread set */
if (l->ts_id >= 0 && l->ts_id < r->ts_id)
return -1;
if (l->ts_id > r->ts_id && r->ts_id >= 0)
return 1;
/* next, L1 */
if (l->l1_id >= 0 && l->l1_id < r->l1_id)
return -1;
if (l->l1_id > r->l1_id && r->l1_id >= 0)
return 1;
/* next, IDX, so that SMT ordering is preserved */
if (l->idx >= 0 && l->idx < r->idx)
return -1;
if (l->idx > r->idx && r->idx >= 0)
return 1;
/* exactly the same (e.g. absent) */
return 0;
}
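/* Usage sketch (not part of the original patch): this comparator is meant
 * to be passed to qsort(), as done further down:
 *
 *   qsort(ha_cpu_topo, lastcpu + 1, sizeof(*ha_cpu_topo), cmp_hw_cpus);
 *
 * After sorting, online CPUs come first, then entries are grouped by
 * package, NUMA node, L3, cluster, L2, thread set and L1, with the CPU
 * index (->idx) as the final tie-breaker so SMT siblings remain adjacent.
 */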
/* Inspect the cpu topology of the machine on startup. If a multi-socket
* machine is detected, try to bind to the first node with active CPUs. This
* is done to prevent an impact on the overall performance when the topology
* of the machine is unknown. This function is not called if one of these
* conditions is met:
* - a non-null nbthread directive is active
* - a restrictive cpu-map directive is active
* - a restrictive affinity is already applied, for example via taskset
*
* Returns the count of CPUs selected. If no automatic binding was required
* or an error occurred and the topology is unknown, 0 is returned.
*/
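/* Example (illustrative only, not in the original patch): on a two-socket
 * machine whose NUMA nodes each expose 16 online CPUs, and without any
 * nbthread, cpu-map or taskset restriction, the function below binds all
 * thread slots to the CPUs of a single node and returns 16.
 */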
static int numa_detect_topology()
{
struct dirent **node_dirlist = NULL;
int node_dirlist_size = 0;
struct hap_cpuset node_cpu_set;
// const char *parse_cpu_set_args[2];
// struct ha_cpu_topo cpu_id = { }; /* all zeroes */
int maxcpus = 0;
int lastcpu = 0;
int grp, thr, cpu;
/* node_cpu_set count is used as return value */
ha_cpuset_zero(&node_cpu_set);
maxcpus = ha_cpuset_size();
for (cpu = 0; cpu < maxcpus; cpu++)
if (!(ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE))
lastcpu = cpu;
// /* now let's only focus on online and bound CPUs to learn more about
// * their topology, their siblings, their cache affinity etc. We can
// * stop at lastcpu which matches the ID of the last known bound CPU
// * when it's set. We'll pre-assign and auto-increment indexes for
// * thread_set_id, cluster_id, l1/l2/l3 id, etc. We don't revisit entries
// * already filled from the list provided by another CPU.
// */
// for (cpu = 0; cpu <= lastcpu; cpu++) {
// struct hap_cpuset cpus_list;
// int cpu2;
//
// if (ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE)
// continue;
//
// /* First, let's check the cache hierarchy. On systems exposing
// * it, index0 generally is the L1D cache, index1 the L1I, index2
// * the L2 and index3 the L3.
// */
//
// /* other CPUs sharing the same L1 cache (SMT) */
// if (ha_cpu_topo[cpu].l1_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cache/index0/shared_cpu_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].l1_id = cpu_id.l1_id;
// }
// cpu_id.l1_id++;
// }
// }
//
// /* other CPUs sharing the same L2 cache (clusters of cores) */
// if (ha_cpu_topo[cpu].l2_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cache/index2/shared_cpu_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].l2_id = cpu_id.l2_id;
// }
// cpu_id.l2_id++;
// }
// }
//
// /* other CPUs sharing the same L3 cache slices (local cores) */
// if (ha_cpu_topo[cpu].l3_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cache/index3/shared_cpu_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].l3_id = cpu_id.l3_id;
// }
// cpu_id.l3_id++;
// }
// }
//
// /* Now let's try to get more info about how the cores are
// * arranged in packages, clusters, cores, threads etc. It
// * overlaps a bit with the cache above, but as not all systems
// * provide all of these, they're quite complementary in fact.
// */
//
// /* threads mapped to same cores */
// if (ha_cpu_topo[cpu].ts_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/thread_siblings_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].ts_id = cpu_id.ts_id;
// }
// cpu_id.ts_id++;
// }
// }
//
// /* clusters of cores when they exist, can be smaller and more
// * precise than core lists (e.g. big.little), otherwise use
// * core lists.
// */
// if (ha_cpu_topo[cpu].cl_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/cluster_cpus_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].cl_id = cpu_id.cl_id;
// }
// cpu_id.cl_id++;
// }
// } else if (ha_cpu_topo[cpu].cl_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].cl_id = cpu_id.cl_id;
// }
// cpu_id.cl_id++;
// }
// }
//
// /* package CPUs list, like nodes, are generally a hard limit
// * for groups, which must not span over multiple of them. On
// * some systems, the package_cpus_list is not always provided,
// * so we may fall back to the physical package id from each
// * CPU, whose number starts at 0. The first one is preferred
// * because it provides a list in a single read().
// */
// if (ha_cpu_topo[cpu].pk_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/package_cpus_list", cpu) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
// for (cpu2 = 0; cpu2 <= lastcpu; cpu2++) {
// if (ha_cpuset_isset(&cpus_list, cpu2))
// ha_cpu_topo[cpu2].pk_id = cpu_id.pk_id;
// }
// cpu_id.pk_id++;
// }
// } else if (ha_cpu_topo[cpu].pk_id < 0 &&
// read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/physical_package_id", cpu) == 0) {
// if (trash.data)
// ha_cpu_topo[cpu].pk_id = str2uic(trash.area);
// }
// }
//
// /* Now locate NUMA node ID if any */
//
// node_dirlist = NULL;
// node_dirlist_size = scandir(NUMA_DETECT_SYSTEM_SYSFS_PATH"/node", &node_dirlist, numa_filter, alphasort);
//
// /* 3. loop through nodes dirs and find the first one with active cpus */
// while (node_dirlist_size-- > 0) {
// const char *node = node_dirlist[node_dirlist_size]->d_name;
//
// cpu_id.no_id = atoi(node + 4); // "nodeXXX"
// if (read_line_to_trash("%s/node/%s/cpulist",
// NUMA_DETECT_SYSTEM_SYSFS_PATH, node) == 0) {
// parse_cpu_set_args[0] = trash.area;
// parse_cpu_set_args[1] = "\0";
// if (parse_cpu_set(parse_cpu_set_args, &node_cpu_set, NULL) == 0) {
// for (cpu = 0; cpu < maxcpus; cpu++)
// if (ha_cpuset_isset(&node_cpu_set, cpu))
// ha_cpu_topo[cpu].no_id = cpu_id.no_id;
// }
// }
// free(node_dirlist[node_dirlist_size]);
// }
// ha_free(&node_dirlist);
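/* Sort the topology array by vicinity and dump one line per CPU so the
 * detected layout can be inspected during development.
 */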
qsort(ha_cpu_topo, lastcpu+1, sizeof(*ha_cpu_topo), cmp_hw_cpus);
for (cpu = 0; cpu <= lastcpu; cpu++) {
printf("thr %3d -> cpu %3d onl=%d bnd=%d pk=%02d no=%02d l3=%02d cl=%03d l2=%03d ts=%03d l1=%03d\n", cpu, ha_cpu_topo[cpu].idx,
!(ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE),
!(ha_cpu_topo[cpu].st & HA_CPU_F_EXCLUDED),
ha_cpu_topo[cpu].pk_id,
ha_cpu_topo[cpu].no_id,
ha_cpu_topo[cpu].l3_id,
ha_cpu_topo[cpu].cl_id,
ha_cpu_topo[cpu].l2_id,
ha_cpu_topo[cpu].ts_id,
ha_cpu_topo[cpu].l1_id);
}
skip_hw_cpus:
/* FIXME: Now figure a criterion for not proceeding below (e.g. found pk/no/l3/l2 above maybe) */
/* let's ignore restricted affinity */
if (thread_cpu_mask_forced() || cpu_map_configured())
goto free_scandir_entries;
/* 1. count the sysfs node<X> directories */
node_dirlist = NULL;
node_dirlist_size = scandir(NUMA_DETECT_SYSTEM_SYSFS_PATH"/node", &node_dirlist, numa_filter, alphasort);
if (node_dirlist_size <= 1)
goto free_scandir_entries;
/* 3. loop through nodes dirs and find the first one with active cpus */
while (node_dirlist_size--) {
const char *node = node_dirlist[node_dirlist_size]->d_name;
ha_cpuset_zero(&node_cpu_set);
if (read_line_to_trash("%s/node/%s/cpumap", NUMA_DETECT_SYSTEM_SYSFS_PATH, node)) {
ha_notice("Cannot read CPUs list of '%s', will not select them to refine binding\n", node);
free(node_dirlist[node_dirlist_size]);
continue;
}
parse_cpumap(trash.area, &node_cpu_set);
for (cpu = 0; cpu < maxcpus; cpu++)
if (ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE)
ha_cpuset_clr(&node_cpu_set, cpu);
/* 5. set affinity on the first found node with active cpus */
if (!ha_cpuset_count(&node_cpu_set)) {
free(node_dirlist[node_dirlist_size]);
continue;
}
ha_diag_warning("Multi-socket cpu detected, automatically binding on active CPUs of '%s' (%u active cpu(s))\n", node, ha_cpuset_count(&node_cpu_set));
for (grp = 0; grp < MAX_TGROUPS; grp++)
for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++)
ha_cpuset_assign(&cpu_map[grp].thread[thr], &node_cpu_set);
free(node_dirlist[node_dirlist_size]);
break;
}
free_scandir_entries:
while (node_dirlist_size-- > 0)
free(node_dirlist[node_dirlist_size]);
free(node_dirlist);
return ha_cpuset_count(&node_cpu_set);
}
#elif defined(__FreeBSD__)
static int numa_detect_topology()
{
struct hap_cpuset node_cpu_set;
int ndomains = 0, i;
size_t len = sizeof(ndomains);
int grp, thr;
ha_cpuset_zero(&node_cpu_set);
/* let's ignore restricted affinity */
if (thread_cpu_mask_forced() || cpu_map_configured())
goto leave;
if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) {
ha_notice("Cannot assess the number of CPUs domains\n");
return 0;
}
BUG_ON(ndomains > MAXMEMDOM);
if (ndomains < 2)
goto leave;
/*
* We retrieve the first valid CPU domain with active CPUs, bind to it,
* and return the number of CPUs of that domain.
*/
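/* Example (illustrative, not in the original patch): if vm.ndomains
 * reports 2 and domain 0 holds active CPUs, the loop below binds every
 * thread slot to domain 0's CPU set and returns its CPU count.
 */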
for (i = 0; i < ndomains; i ++) {
struct hap_cpuset dom;
ha_cpuset_zero(&dom);
if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_DOMAIN, i, sizeof(dom.cpuset), &dom.cpuset) == -1)
continue;
if (!ha_cpuset_count(&dom))
continue;
ha_cpuset_assign(&node_cpu_set, &dom);
ha_diag_warning("Multi-socket cpu detected, automatically binding on active CPUs of '%d' (%u active cpu(s))\n", i, ha_cpuset_count(&node_cpu_set));
for (grp = 0; grp < MAX_TGROUPS; grp++)
for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++)
ha_cpuset_assign(&cpu_map[grp].thread[thr], &node_cpu_set);
break;
}
leave:
return ha_cpuset_count(&node_cpu_set);
}
#else
static int numa_detect_topology()
{
return 0;
}
#endif
#endif /* USE_THREAD && USE_CPU_AFFINITY */
/*
* Returns the error code, 0 if OK, or any combination of :
* - ERR_ABORT: must abort ASAP
@@ -2921,39 +2514,9 @@ int check_config_validity()
if (!global.tune.requri_len)
global.tune.requri_len = REQURI_LEN;
if (!global.nbthread) {
/* nbthread not set, thus automatic. In this case, and only if
* running on a single process, we enable the same number of
* threads as the number of CPUs the process is bound to. This
* makes it easy to control the number of threads using taskset.
*/
global.nbthread = 1;
#if defined(USE_THREAD)
{
int numa_cores = 0;
#if defined(USE_CPU_AFFINITY)
if (global.numa_cpu_mapping)
numa_cores = numa_detect_topology();
#endif
global.nbthread = numa_cores ? numa_cores :
thread_cpus_enabled_at_boot;
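/* Example (illustrative, not in the original patch): starting haproxy
* under "taskset -c 0-3" with no nbthread directive leaves numa_cores at 0
* (restricted affinity), so thread_cpus_enabled_at_boot (4 here)
* determines nbthread.
*/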
/* Note that we cannot have more than 32 or 64 threads per group */
if (!global.nbtgroups)
global.nbtgroups = 1;
if (global.nbthread > MAX_THREADS_PER_GROUP * global.nbtgroups) {
ha_diag_warning("nbthread not set, found %d CPUs, limiting to %d threads (maximum is %d per thread group). Please set nbthreads and/or increase thread-groups in the global section to silence this warning.\n",
global.nbthread, MAX_THREADS_PER_GROUP * global.nbtgroups, MAX_THREADS_PER_GROUP);
global.nbthread = MAX_THREADS_PER_GROUP * global.nbtgroups;
}
}
#endif
}
if (!global.nbtgroups)
global.nbtgroups = 1;
/* these are supposed to have been set in thread_detect_count() by now */
BUG_ON(!global.nbthread);
BUG_ON(!global.nbtgroups);
if (thread_map_to_groups() < 0) {
err_code |= ERR_ALERT | ERR_FATAL;