@@ -66,21 +66,22 @@ elseif ompiv == "3.1" or ompiv == "4.0" then
66
66
setenv (" RSNT_SLURM_MPI_TYPE" , slurm_pmi )
67
67
end
68
68
69
- if ompiv == " 3.1" then -- removed in 4.0
70
- setenv (" OMPI_MCA_mtl" , " ^mxm" )
71
- end
72
-
73
69
-- disable openib unconditionally, as it does not work very well with UCX
74
70
setenv (" OMPI_MCA_btl" , " ^openib" )
75
71
76
72
if os.getenv (" RSNT_INTERCONNECT" ) == " omnipath" then
77
73
setenv (" OMPI_MCA_pml" , " ^ucx,yalla" )
78
- if ompiv == " 3.1" then -- removed in 4.0
79
- setenv (" OMPI_MCA_oob" , " ^ud" )
74
+ if ompiv == " 3.1" then -- removed in 4.0; don't use ofi by default for cuda
75
+ setenv (" OMPI_MCA_mtl" , " ^mxm,ofi" )
76
+ setenv (" OMPI_MCA_oob" , " ^ud" )
80
77
end
81
78
setenv (" OMPI_MCA_coll" , " ^fca,hcoll" )
79
+ setenv (" OMPI_MCA_osc" , " ^ucx" )
82
80
else
83
81
setenv (" OMPI_MCA_pml" , " ^yalla" )
82
+ if ompiv == " 3.1" then -- removed in 4.0
83
+ setenv (" OMPI_MCA_mtl" , " ^mxm" )
84
+ end
84
85
-- avoids error messages about multicast, needs investigation
85
86
-- setenv("HCOLL_ENABLE_MCAST_ALL", "0")
86
87
-- we have multiple issues with the hcoll module, will need
0 commit comments