Skip to content

Commit f11b0c7

Browse files
authored
Merge pull request #2330 from jjhursey/topic/ibcast-non-uniform-dt-wa
coll/libnbc: Work around for non-uniform data types in ibcast
2 parents 8c07a7f + 350ef67 commit f11b0c7

File tree

3 files changed

+46
-10
lines changed

3 files changed

+46
-10
lines changed

ompi/mca/coll/libnbc/coll_libnbc.h

+3
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* reserved.
1616
* Copyright (c) 2014-2015 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
18+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -67,6 +68,8 @@ BEGIN_C_DECLS
6768
/* number of implemented collective functions */
6869
#define NBC_NUM_COLL 17
6970

71+
extern bool libnbc_ibcast_skip_dt_decision;
72+
7073
struct ompi_coll_libnbc_component_t {
7174
mca_coll_base_component_2_0_0_t super;
7275
opal_free_list_t requests;

ompi/mca/coll/libnbc/coll_libnbc_component.c

+22
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ const char *mca_coll_libnbc_component_version_string =
3939

4040

4141
static int libnbc_priority = 10;
42+
bool libnbc_ibcast_skip_dt_decision = true;
4243

4344

4445
static int libnbc_open(void);
@@ -131,6 +132,27 @@ libnbc_register(void)
131132
MCA_BASE_VAR_SCOPE_READONLY,
132133
&libnbc_priority);
133134

135+
/* ibcast decision function can make the wrong decision if a legal
136+
* non-uniform data type signature is used. This has resulted in the
137+
* collective operation failing, and possibly producing wrong answers.
138+
* We are investigating a fix for this problem, but it is taking a while.
139+
* https://github.com/open-mpi/ompi/issues/2256
140+
* https://github.com/open-mpi/ompi/issues/1763
141+
* As a result we are adding an MCA parameter to make a conservative
142+
* decision to avoid this issue. If the user knows that their application
143+
* does not use data types in this way, then they can set this parameter
144+
* to get the old behavior. Once the issue is truly fixed, then this
145+
* parameter can be removed.
146+
*/
147+
libnbc_ibcast_skip_dt_decision = true;
148+
(void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,
149+
"ibcast_skip_dt_decision",
150+
"In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast.",
151+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
152+
OPAL_INFO_LVL_9,
153+
MCA_BASE_VAR_SCOPE_READONLY,
154+
&libnbc_ibcast_skip_dt_decision);
155+
134156
return OMPI_SUCCESS;
135157
}
136158

ompi/mca/coll/libnbc/nbc_ibcast.c

+21-10
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* and Technology (RIST). All rights reserved.
1010
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1111
* reserved.
12+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1213
*
1314
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
1415
*
@@ -65,16 +66,26 @@ int ompi_coll_libnbc_ibcast(void *buffer, int count, MPI_Datatype datatype, int
6566

6667
segsize = 16384;
6768
/* algorithm selection */
68-
if (p <= 4) {
69-
alg = NBC_BCAST_LINEAR;
70-
} else if (size * count < 65536) {
71-
alg = NBC_BCAST_BINOMIAL;
72-
} else if (size * count < 524288) {
73-
alg = NBC_BCAST_CHAIN;
74-
segsize = 8192;
75-
} else {
76-
alg = NBC_BCAST_CHAIN;
77-
segsize = 32768;
69+
if( libnbc_ibcast_skip_dt_decision ) {
70+
if (p <= 4) {
71+
alg = NBC_BCAST_LINEAR;
72+
}
73+
else {
74+
alg = NBC_BCAST_BINOMIAL;
75+
}
76+
}
77+
else {
78+
if (p <= 4) {
79+
alg = NBC_BCAST_LINEAR;
80+
} else if (size * count < 65536) {
81+
alg = NBC_BCAST_BINOMIAL;
82+
} else if (size * count < 524288) {
83+
alg = NBC_BCAST_CHAIN;
84+
segsize = 8192;
85+
} else {
86+
alg = NBC_BCAST_CHAIN;
87+
segsize = 32768;
88+
}
7889
}
7990

8091
#ifdef NBC_CACHE_SCHEDULE

0 commit comments

Comments
 (0)