Skip to content

Commit 4f1eb09

Browse files
committed
Disable ofi for omnipath to work around it being selected by accident.
See open-mpi/ompi#4899 for details. The fix is in Open MPI 4.0.1 but not yet in 3.1.2.
1 parent 3c541a4 commit 4f1eb09

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

lmod/openmpi_custom.lua

+7-6
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,22 @@ elseif ompiv == "3.1" or ompiv == "4.0" then
6666
setenv("RSNT_SLURM_MPI_TYPE", slurm_pmi)
6767
end
6868

69-
if ompiv == "3.1" then -- removed in 4.0
70-
setenv("OMPI_MCA_mtl", "^mxm")
71-
end
72-
7369
-- disable openib unconditionally, as it does not work very well with UCX
7470
setenv("OMPI_MCA_btl", "^openib")
7571

7672
if os.getenv("RSNT_INTERCONNECT") == "omnipath" then
7773
setenv("OMPI_MCA_pml", "^ucx,yalla")
78-
if ompiv == "3.1" then -- removed in 4.0
79-
setenv("OMPI_MCA_oob", "^ud")
74+
if ompiv == "3.1" then -- removed in 4.0; don't use ofi by default for cuda
75+
setenv("OMPI_MCA_mtl", "^mxm,ofi")
76+
setenv("OMPI_MCA_oob", "^ud")
8077
end
8178
setenv("OMPI_MCA_coll", "^fca,hcoll")
79+
setenv("OMPI_MCA_osc", "^ucx")
8280
else
8381
setenv("OMPI_MCA_pml", "^yalla")
82+
if ompiv == "3.1" then -- removed in 4.0
83+
setenv("OMPI_MCA_mtl", "^mxm")
84+
end
8485
-- avoids error messages about multicast, needs investigation
8586
-- setenv("HCOLL_ENABLE_MCAST_ALL", "0")
8687
-- we have multiple issues with the hcoll module, will need

0 commit comments

Comments
 (0)