Skip to content

Commit 072cf8d

Browse files
committed
restoring more hwloc --cpu-set behavior from OMPI 3.x
My rough understanding of where some important masks come from is: rdata->available = OMPI's allowed PUs, possibly from --cpu-set cmdline hwloc_topology_get_allowed_cpuset() = hwloc maybe removing offline cpus and/or cpus not in our cgroup the old hwloc had fields obj->online_cpuset obj->allowed_cpuset that I'm guessing is similar to hwloc_topology_get_allowed_cpuset() Here's an example of the behavior this checkin changes: Suppose a machine's hardware has [ 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15 ] and we run with --cpu-set 4,5,8,9,12,13,14,15 --map-by ppr:2:node:pe=3 -bind-to hwthread:overload-allowed The allowed hardware threads would be [..../xx../xx../xxxx] and the 3.x code would place the ranks on MCW 0 : [..../BB../B.../....] MCW 1 : [..../..../.B../BB..] which at least is using the allowed resources. The 4.x code doesn't pay as much attention to rdata->available and was placing the ranks on MCW 0 : [..../BBB./..../....] MCW 1 : [..../..../BBB./....] This part of the old code's behavior came from bind_downward() which would cycle through a sequence of potential target objects checking opal_hwloc_base_get_npus() under each potential object, and also using opal_hwloc_base_get_available_cpus() for a selected object. Those functions used to take into account rdata->available. The changes I've made are to maintain old behavior in opal_hwloc_base_get_npus(topo, obj) and to add back in the opal_hwloc_base_get_available_cpus() function that had been removed. The old behavior of opal_hwloc_base_get_npus() had two paths: 1. df_search_cores which counts how many cores exist under obj that intersect the hwloc_topology_get_allowed_cpuset() (I think it likely should also intersect with rdata->available, but it didn't before and I didn't change that aspect). Here the OMPI 4.x code just used hwloc_get_nbobjs_inside_cpuset_by_type(,CORE) so I restored the df_search_cores. 
It could likely be replaced by intersecting with the available/allowed masks and then using the get_nbobjs_inside.. call, but I went with the old code 2. opal_hwloc_base_get_available_cpus(topo,obj) that intersected the allowed_cpuset as well as the rdata->available. Here the OMPI 4.x code just used cpuset = obj->cpuset, so I restored the call to intersect it with the allowed_cpuset from hwloc and the rdata->available from ompi. For the use-case at the top of this PR this is all the changes needed. I did look a little further to see where else OMPI 3.x used opal_hwloc_base_get_available_cpus(), as I believe every place that removed that function is a place that used to account for rdata->available that now doesn't. So I also added opal_hwloc_base_get_available_cpus back into bind_in_place even though I wasn't tracing that codepath. I didn't restore all the uses I found in 3.x of the previously removed opal_hwloc_base_get_available_cpus() function, instead only touching code that seemed similar/related to code I was already touching.
1 parent bf3980d commit 072cf8d

File tree

6 files changed

+177
-10
lines changed

6 files changed

+177
-10
lines changed

opal/mca/hwloc/base/base.h

+2
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ OPAL_DECLSPEC int opal_hwloc_base_filter_cpus(hwloc_topology_t topo);
171171
* Free the hwloc topology.
172172
*/
173173
OPAL_DECLSPEC void opal_hwloc_base_free_topology(hwloc_topology_t topo);
174+
OPAL_DECLSPEC hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
175+
hwloc_obj_t obj);
174176
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
175177
hwloc_obj_type_t target,
176178
unsigned cache_level,

opal/mca/hwloc/base/hwloc_base_frame.c

+8-1
Original file line numberDiff line numberDiff line change
@@ -423,14 +423,21 @@ char* opal_hwloc_base_print_locality(opal_hwloc_locality_t locality)
423423

424424
/* Constructor for opal_hwloc_obj_data_t: put every field into its
 * "not yet computed" state.  The available bitmap is filled in lazily
 * by opal_hwloc_base_get_available_cpus() the first time it is asked
 * about the owning object. */
static void obj_data_const(opal_hwloc_obj_data_t *ptr)
{
    ptr->available = NULL;
    ptr->npus_calculated = false;
    ptr->npus = 0;
    ptr->idx = UINT_MAX;
    ptr->num_bound = 0;
}
432+
/* Destructor for opal_hwloc_obj_data_t: release the cached
 * availability bitmap, if one was ever computed for this object. */
static void obj_data_dest(opal_hwloc_obj_data_t *ptr)
{
    if (NULL != ptr->available) {
        hwloc_bitmap_free(ptr->available);
    }
}
431438
OBJ_CLASS_INSTANCE(opal_hwloc_obj_data_t,
432439
opal_object_t,
433-
obj_data_const, NULL);
440+
obj_data_const, obj_data_dest);
434441

435442
static void sum_const(opal_hwloc_summary_t *ptr)
436443
{

opal/mca/hwloc/base/hwloc_base_util.c

+152-3
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ void opal_hwloc_base_free_topology(hwloc_topology_t topo)
531531
void opal_hwloc_base_get_local_cpuset(void)
532532
{
533533
hwloc_obj_t root;
534+
hwloc_cpuset_t base_cpus;
534535

535536
if (NULL != opal_hwloc_topology) {
536537
if (NULL == opal_hwloc_my_cpuset) {
@@ -543,7 +544,8 @@ void opal_hwloc_base_get_local_cpuset(void)
543544
HWLOC_CPUBIND_PROCESS) < 0) {
544545
/* we are not bound - use the root's available cpuset */
545546
root = hwloc_get_root_obj(opal_hwloc_topology);
546-
hwloc_bitmap_copy(opal_hwloc_my_cpuset, root->cpuset);
547+
base_cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
548+
hwloc_bitmap_copy(opal_hwloc_my_cpuset, base_cpus);
547549
}
548550
}
549551
}
@@ -571,6 +573,77 @@ int opal_hwloc_base_report_bind_failure(const char *file,
571573
return OPAL_SUCCESS;
572574
}
573575

576+
hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
577+
hwloc_obj_t obj)
578+
{
579+
hwloc_obj_t root;
580+
hwloc_cpuset_t avail, specd=NULL;
581+
opal_hwloc_topo_data_t *rdata;
582+
opal_hwloc_obj_data_t *data;
583+
584+
OPAL_OUTPUT_VERBOSE((10, opal_hwloc_base_framework.framework_output,
585+
"hwloc:base: get available cpus"));
586+
587+
/* get the node-level information */
588+
root = hwloc_get_root_obj(topo);
589+
rdata = (opal_hwloc_topo_data_t*)root->userdata;
590+
/* bozo check */
591+
if (NULL == rdata) {
592+
rdata = OBJ_NEW(opal_hwloc_topo_data_t);
593+
root->userdata = (void*)rdata;
594+
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
595+
"hwloc:base:get_available_cpus first time - filtering cpus"));
596+
}
597+
598+
/* are we asking about the root object? */
599+
if (obj == root) {
600+
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
601+
"hwloc:base:get_available_cpus root object"));
602+
return rdata->available;
603+
}
604+
605+
/* some hwloc object types don't have cpus */
606+
// This used to use online_cpuset:
607+
if (NULL == obj->cpuset) {
608+
return NULL;
609+
}
610+
if (!hwloc_bitmap_intersects(obj->cpuset, hwloc_topology_get_allowed_cpuset(topo))) {
611+
return NULL;
612+
}
613+
614+
/* see if we already have this info */
615+
if (NULL == (data = (opal_hwloc_obj_data_t*)obj->userdata)) {
616+
/* nope - create the object */
617+
data = OBJ_NEW(opal_hwloc_obj_data_t);
618+
obj->userdata = (void*)data;
619+
}
620+
621+
/* do we have the cpuset */
622+
if (NULL != data->available) {
623+
return data->available;
624+
}
625+
626+
/* find the available processors on this object */
627+
avail = hwloc_bitmap_alloc();
628+
// This used to use online_cpuset:
629+
hwloc_bitmap_and(avail, obj->cpuset, hwloc_topology_get_allowed_cpuset(topo));
630+
631+
/* filter this against the node-available processors */
632+
if (NULL == rdata->available) {
633+
hwloc_bitmap_free(avail);
634+
return NULL;
635+
}
636+
specd = hwloc_bitmap_alloc();
637+
hwloc_bitmap_and(specd, avail, rdata->available);
638+
639+
/* cache the info */
640+
data->available = specd;
641+
642+
/* cleanup */
643+
hwloc_bitmap_free(avail);
644+
return specd;
645+
}
646+
574647
/* determine if there is a single cpu in a bitmap */
575648
bool opal_hwloc_base_single_cpu(hwloc_cpuset_t cpuset)
576649
{
@@ -599,6 +672,50 @@ bool opal_hwloc_base_single_cpu(hwloc_cpuset_t cpuset)
599672
return one;
600673
}
601674

675+
/*
 * Depth-first walk under "obj" accumulating into *cnt the number of
 * CORE objects whose cpuset intersects hwloc's allowed cpuset.
 *
 * Updated slightly from the 3.x original to cope with "virtual"
 * elements in the tree: objects with negative depth and no arity that
 * are not really in the tree but hang off a real parent -- for
 * tree-walking purposes we substitute the parent.
 *
 * It is likely this routine could be replaced by a bitwise
 * intersection plus hwloc_get_nbobjs_inside_cpuset_by_type(,,CORE),
 * but for now it stays close to the 3.x code.
 */
static void df_search_cores(hwloc_topology_t topo, hwloc_obj_t obj, unsigned int *cnt)
{
    opal_hwloc_obj_data_t *data;
    unsigned i;

    /* climb from a virtual element up to the real tree element */
    while (obj->depth < 0) {
        obj = obj->parent;
    }

    if (HWLOC_OBJ_CORE == obj->type) {
        data = (opal_hwloc_obj_data_t*)obj->userdata;
        if (NULL == data) {
            data = OBJ_NEW(opal_hwloc_obj_data_t);
            obj->userdata = (void*)data;
        }
        if (NULL == opal_hwloc_base_cpu_list) {
            /*
             * do not count not allowed cores (e.g. cores with zero allowed PU)
             * if SMT is enabled, do count cores with at least one allowed hwthread
             */
            if (!hwloc_bitmap_intersects(obj->cpuset,
                                         hwloc_topology_get_allowed_cpuset(topo))) {
                return;
            }
            data->npus = 1;
        }
        *cnt += data->npus;
        return;
    }

    /* not a core - recurse over the children */
    for (i = 0; i < obj->arity; i++) {
        df_search_cores(topo, obj->children[i], cnt);
    }
    return;
}
717+
718+
602719
/* get the number of pu's under a given hwloc object */
603720
unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
604721
hwloc_obj_t obj)
@@ -616,15 +733,47 @@ unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
616733
* one hwthread/core. Instead, find the number of cores
617734
* in the system
618735
*/
619-
cnt = hwloc_get_nbobjs_inside_cpuset_by_type(topo, obj->cpuset, HWLOC_OBJ_CORE);
736+
// The OMPI 3.x code used df_search_cores() which counts the number
737+
// of objects of type CORE under obj if the object intersected
738+
// obj->allowed_cpuset which was a hwloc field that left out things
739+
// like offline cpus or cpus not in our cgroup.
740+
//
741+
// The OMPI 4.x code initially moved to a vanilla
742+
// cnt = hwloc_get_nbobjs_inside_cpuset_by_type(topo, obj->cpuset, HWLOC_OBJ_CORE);
743+
//
744+
// I'm currently restoring it to df_search_cores() and intersecting
745+
// hwloc_topology_get_allowed_cpuset(topo) to decide if a CORE obj
746+
// should be counted. I think it ought to also be intersecting with
747+
// rdata->available (eg if the user specified a --cpu-set to constrain
748+
// the binding to) but for now I'm just restoring the old behavior.
749+
if (NULL != hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, 0)) {
750+
/* starting at the incoming obj, do a down-first search
751+
* and count the number of cores under it
752+
*/
753+
cnt = 0;
754+
df_search_cores(topo, obj, &cnt);
755+
}
756+
620757
} else {
621758
hwloc_cpuset_t cpuset;
622759

623760
/* if we are treating cores as cpus, or the system can't detect
624761
* "cores", then get the available cpuset for this object - this will
625762
* create and store the data
626763
*/
627-
if (NULL == (cpuset = obj->cpuset)) {
764+
// The OMPI 3.x code used cpuset = opal_hwloc_base_get_available_cpus() which
765+
// intersected obj->cpuset with all of obj->online_cpuset, obj->allowed_cpuset
766+
// (the old hwloc objects that omitted offline out-of-our-cgroup cpus)
767+
// and rdata->available (eg from mpirun cmdline --cpu-set).
768+
//
769+
// The OMPI 4.x code initially moved to a vanilla
770+
// cpuset = obj->cpuset
771+
//
772+
// I'm restoring it to opal_hwloc_base_get_available_cpus() and intersecting
773+
// hwloc_topology_get_allowed_cpuset(topo) to decide if a CORE obj
774+
// should be counted, and still using the rdata->available there too.
775+
776+
if (NULL == (cpuset = opal_hwloc_base_get_available_cpus(topo, obj))) {
628777
return 0;
629778
}
630779
/* count the number of bits that are set - there is

opal/mca/hwloc/hwloc-internal.h

+1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ typedef uint8_t opal_hwloc_resource_type_t;
139139
/* structs for storing info on objects */
140140
typedef struct {
141141
opal_object_t super;
142+
hwloc_cpuset_t available;
142143
bool npus_calculated;
143144
unsigned int npus;
144145
unsigned int idx;

orte/mca/rmaps/base/rmaps_base_binding.c

+7-2
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ static int bind_generic(orte_job_t *jdata,
130130
orte_job_map_t *map;
131131
orte_proc_t *proc;
132132
hwloc_obj_t trg_obj, tmp_obj, nxt_obj;
133+
hwloc_cpuset_t cpus;
133134
unsigned int ncpus;
134135
opal_hwloc_obj_data_t *data;
135136
int total_cpus;
@@ -248,7 +249,9 @@ static int bind_generic(orte_job_t *jdata,
248249
}
249250
}
250251
/* bind the proc here */
251-
hwloc_bitmap_or(totalcpuset, totalcpuset, trg_obj->cpuset);
252+
cpus = opal_hwloc_base_get_available_cpus(node->topology->topo, trg_obj);
253+
hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
254+
252255
/* track total #cpus */
253256
total_cpus += ncpus;
254257
/* move to the next location, in case we need it */
@@ -301,6 +304,7 @@ static int bind_in_place(orte_job_t *jdata,
301304
orte_job_map_t *map;
302305
orte_node_t *node;
303306
orte_proc_t *proc;
307+
hwloc_cpuset_t cpus;
304308
unsigned int idx, ncpus;
305309
struct hwloc_topology_support *support;
306310
opal_hwloc_obj_data_t *data;
@@ -487,7 +491,8 @@ static int bind_in_place(orte_job_t *jdata,
487491
ORTE_NAME_PRINT(&proc->name),
488492
hwloc_obj_type_string(locale->type), idx);
489493
/* bind the proc here */
490-
hwloc_bitmap_list_asprintf(&cpu_bitmap, locale->cpuset);
494+
cpus = opal_hwloc_base_get_available_cpus(node->topology->topo, locale);
495+
hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
491496
orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
492497
/* update the location, in case it changed */
493498
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR);

orte/mca/rmaps/ppr/rmaps_ppr.c

+7-4
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ static void prune(orte_jobid_t jobid,
443443
hwloc_obj_type_t lvl;
444444
unsigned cache_level = 0, k;
445445
int nprocs;
446-
hwloc_cpuset_t avail;
446+
hwloc_cpuset_t avail, cpus, childcpus;
447447
int n, limit, nmax, nunder, idx, idxmax = 0;
448448
orte_proc_t *proc, *pptr, *procmax;
449449
opal_hwloc_level_t ll;
@@ -494,7 +494,7 @@ static void prune(orte_jobid_t jobid,
494494
lvl, cache_level,
495495
i, OPAL_HWLOC_AVAILABLE);
496496
/* get the available cpuset */
497-
avail = obj->cpuset;
497+
avail = opal_hwloc_base_get_available_cpus(node->topology->topo, obj);
498498

499499
/* look at the intersection of this object's cpuset and that
500500
* of each proc in the job/app - if they intersect, then count this proc
@@ -514,7 +514,8 @@ static void prune(orte_jobid_t jobid,
514514
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
515515
return;
516516
}
517-
if (hwloc_bitmap_intersects(avail, locale->cpuset)) {
517+
cpus = opal_hwloc_base_get_available_cpus(node->topology->topo, locale);
518+
if (hwloc_bitmap_intersects(avail, cpus)) {
518519
nprocs++;
519520
}
520521
}
@@ -551,6 +552,7 @@ static void prune(orte_jobid_t jobid,
551552
/* find the child with the most procs underneath it */
552553
for (k=0; k < top->arity && limit < nprocs; k++) {
553554
/* get this object's available cpuset */
555+
childcpus = opal_hwloc_base_get_available_cpus(node->topology->topo, top->children[k]);
554556
nunder = 0;
555557
pptr = NULL;
556558
for (n=0; n < node->procs->size; n++) {
@@ -566,7 +568,8 @@ static void prune(orte_jobid_t jobid,
566568
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
567569
return;
568570
}
569-
if (hwloc_bitmap_intersects(top->children[k]->cpuset, locale->cpuset)) {
571+
cpus = opal_hwloc_base_get_available_cpus(node->topology->topo, locale);
572+
if (hwloc_bitmap_intersects(childcpus, cpus)) {
570573
nunder++;
571574
if (NULL == pptr) {
572575
/* save the location of the first proc under this object */

0 commit comments

Comments (0)