Skip to content

Commit e56ee1e

Browse files
committed
Remove the remaining cruft from dual oob transport
* When we moved to allowing dual rml/oob transports, we added a bunch of stuff that is no longer needed. Remove it so as to simplify the messaging system.
* Fix the routed/radix component so it correctly returns the parent's vpid.

Signed-off-by: Ralph Castain <rhc@pmix.org>
1 parent b80210c commit e56ee1e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+385
-1833
lines changed

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

+8-11
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
1010
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
1313
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1414
* Copyright (c) 2018 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
@@ -277,8 +277,7 @@ static void job_errors(int fd, short args, void *cbdata)
277277
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
278278
ORTE_JOBID_PRINT(jdata->jobid),
279279
ORTE_NAME_PRINT(&jdata->originator)));
280-
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
281-
&jdata->originator, answer,
280+
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
282281
ORTE_RML_TAG_LAUNCH_RESP,
283282
orte_rml_send_callback, NULL))) {
284283
ORTE_ERROR_LOG(ret);
@@ -358,7 +357,6 @@ static void proc_errors(int fd, short args, void *cbdata)
358357
orte_proc_state_t state = caddy->proc_state;
359358
int i;
360359
int32_t i32, *i32ptr;
361-
char *rtmod;
362360

363361
ORTE_ACQUIRE_OBJECT(caddy);
364362

@@ -381,7 +379,6 @@ static void proc_errors(int fd, short args, void *cbdata)
381379
goto cleanup;
382380
}
383381
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
384-
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
385382

386383
/* we MUST handle a communication failure before doing anything else
387384
* as it requires some special care to avoid normal termination issues
@@ -412,9 +409,9 @@ static void proc_errors(int fd, short args, void *cbdata)
412409
"%s Comm failure: daemons terminating - recording daemon %s as gone",
413410
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
414411
/* remove from dependent routes, if it is one */
415-
orte_routed.route_lost(rtmod, proc);
412+
orte_routed.route_lost(proc);
416413
/* if all my routes and local children are gone, then terminate ourselves */
417-
if (0 == orte_routed.num_routes(rtmod)) {
414+
if (0 == orte_routed.num_routes()) {
418415
for (i=0; i < orte_local_children->size; i++) {
419416
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
420417
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
@@ -435,7 +432,7 @@ static void proc_errors(int fd, short args, void *cbdata)
435432
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
436433
"%s Comm failure: %d routes remain alive",
437434
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
438-
(int)orte_routed.num_routes(rtmod)));
435+
(int)orte_routed.num_routes()));
439436
}
440437
goto cleanup;
441438
}
@@ -493,7 +490,7 @@ static void proc_errors(int fd, short args, void *cbdata)
493490
}
494491
/* if all my routes and children are gone, then terminate
495492
ourselves nicely (i.e., this is a normal termination) */
496-
if (0 == orte_routed.num_routes(rtmod)) {
493+
if (0 == orte_routed.num_routes()) {
497494
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
498495
"%s errmgr:default:hnp all routes gone - exiting",
499496
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -718,7 +715,7 @@ static void proc_errors(int fd, short args, void *cbdata)
718715
default_hnp_abort(jdata);
719716
}
720717
/* remove from dependent routes, if it is one */
721-
orte_routed.route_lost(rtmod, proc);
718+
orte_routed.route_lost(proc);
722719
break;
723720

724721
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
@@ -841,7 +838,7 @@ static void default_hnp_abort(orte_job_t *jdata)
841838
i32ptr = &i32;
842839
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32)) {
843840
/* warn user */
844-
orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
841+
orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
845842
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
846843
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
847844
i32, (1 == i32) ? "process returned\na non-zero exit code" :

orte/mca/errmgr/default_orted/errmgr_default_orted.c

+8-17
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,7 @@ static void orted_abort(int error_code, char *fmt, ...)
204204
}
205205

206206
/* send it */
207-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
208-
ORTE_PROC_MY_HNP, alert,
207+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
209208
ORTE_RML_TAG_PLM,
210209
orte_rml_send_callback, NULL))) {
211210
ORTE_ERROR_LOG(rc);
@@ -303,8 +302,7 @@ static void job_errors(int fd, short args, void *cbdata)
303302
goto cleanup;
304303
}
305304
/* send it */
306-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
307-
ORTE_PROC_MY_HNP, alert,
305+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
308306
ORTE_RML_TAG_PLM,
309307
orte_rml_send_callback, NULL))) {
310308
ORTE_ERROR_LOG(rc);
@@ -321,7 +319,6 @@ static void proc_errors(int fd, short args, void *cbdata)
321319
orte_job_t *jdata;
322320
orte_process_name_t *proc = &caddy->name;
323321
orte_proc_state_t state = caddy->proc_state;
324-
char *rtmod;
325322
orte_proc_t *child, *ptr;
326323
opal_buffer_t *alert;
327324
orte_plm_cmd_flag_t cmd;
@@ -386,9 +383,6 @@ static void proc_errors(int fd, short args, void *cbdata)
386383
goto cleanup;
387384
}
388385

389-
/* get our management conduit's routed module name */
390-
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
391-
392386
if (ORTE_PROC_STATE_COMM_FAILED == state) {
393387
/* if it is our own connection, ignore it */
394388
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
@@ -444,7 +438,7 @@ static void proc_errors(int fd, short args, void *cbdata)
444438
}
445439
/* if all my routes and children are gone, then terminate
446440
ourselves nicely (i.e., this is a normal termination) */
447-
if (0 == orte_routed.num_routes(rtmod)) {
441+
if (0 == orte_routed.num_routes()) {
448442
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
449443
"%s errmgr:default:orted all routes gone - exiting",
450444
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -453,7 +447,7 @@ static void proc_errors(int fd, short args, void *cbdata)
453447
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
454448
"%s errmgr:default:orted not exiting, num_routes() == %d",
455449
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
456-
(int)orte_routed.num_routes(rtmod)));
450+
(int)orte_routed.num_routes()));
457451
}
458452
}
459453
/* if not, then we can continue */
@@ -513,8 +507,7 @@ static void proc_errors(int fd, short args, void *cbdata)
513507
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
514508
ORTE_NAME_PRINT(&child->name),
515509
jdata->num_local_procs));
516-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
517-
ORTE_PROC_MY_HNP, alert,
510+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
518511
ORTE_RML_TAG_PLM,
519512
orte_rml_send_callback, NULL))) {
520513
ORTE_ERROR_LOG(rc);
@@ -579,7 +572,7 @@ static void proc_errors(int fd, short args, void *cbdata)
579572
}
580573
/* if all my routes and children are gone, then terminate
581574
ourselves nicely (i.e., this is a normal termination) */
582-
if (0 == orte_routed.num_routes(rtmod)) {
575+
if (0 == orte_routed.num_routes()) {
583576
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
584577
"%s errmgr:default:orted all routes gone - exiting",
585578
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -621,8 +614,7 @@ static void proc_errors(int fd, short args, void *cbdata)
621614
ORTE_NAME_PRINT(&child->name),
622615
jdata->num_local_procs));
623616
/* send it */
624-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
625-
ORTE_PROC_MY_HNP, alert,
617+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
626618
ORTE_RML_TAG_PLM,
627619
orte_rml_send_callback, NULL))) {
628620
ORTE_ERROR_LOG(rc);
@@ -677,8 +669,7 @@ static void proc_errors(int fd, short args, void *cbdata)
677669
OBJ_RELEASE(jdata);
678670

679671
/* send it */
680-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
681-
ORTE_PROC_MY_HNP, alert,
672+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
682673
ORTE_RML_TAG_PLM,
683674
orte_rml_send_callback, NULL))) {
684675
ORTE_ERROR_LOG(rc);

orte/mca/ess/base/ess_base_std_orted.c

+1-28
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ int orte_ess_base_orted_setup(void)
109109
hwloc_obj_t obj;
110110
unsigned i, j;
111111
orte_topology_t *t;
112-
opal_list_t transports;
113112
orte_ess_base_signal_t *sig;
114113
int idx;
115114

@@ -448,27 +447,6 @@ int orte_ess_base_orted_setup(void)
448447
goto error;
449448
}
450449

451-
/* get a conduit for our use - we never route IO over fabric */
452-
OBJ_CONSTRUCT(&transports, opal_list_t);
453-
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
454-
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
455-
if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
456-
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
457-
error = "orte_rml_open_mgmt_conduit";
458-
goto error;
459-
}
460-
OPAL_LIST_DESTRUCT(&transports);
461-
462-
OBJ_CONSTRUCT(&transports, opal_list_t);
463-
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
464-
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
465-
if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
466-
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
467-
error = "orte_rml_open_coll_conduit";
468-
goto error;
469-
}
470-
OPAL_LIST_DESTRUCT(&transports);
471-
472450
/*
473451
* Group communications
474452
*/
@@ -609,10 +587,6 @@ int orte_ess_base_orted_finalize(void)
609587
pmix_server_finalize();
610588
(void) mca_base_framework_close(&opal_pmix_base_framework);
611589

612-
/* release the conduits */
613-
orte_rml.close_conduit(orte_mgmt_conduit);
614-
orte_rml.close_conduit(orte_coll_conduit);
615-
616590
/* close frameworks */
617591
(void) mca_base_framework_close(&orte_filem_base_framework);
618592
(void) mca_base_framework_close(&orte_grpcomm_base_framework);
@@ -695,8 +669,7 @@ static void signal_forward_callback(int fd, short event, void *arg)
695669
}
696670

697671
/* send it to ourselves */
698-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
699-
ORTE_PROC_MY_NAME, cmd,
672+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, cmd,
700673
ORTE_RML_TAG_DAEMON,
701674
NULL, NULL))) {
702675
ORTE_ERROR_LOG(rc);

orte/mca/ess/base/ess_base_std_tool.c

+4-14
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14-
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
1616
*
1717
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@@ -94,7 +94,6 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
9494
{
9595
int ret;
9696
char *error = NULL;
97-
opal_list_t transports;
9897
opal_list_t info;
9998
opal_value_t *kv, *knext, val;
10099
opal_pmix_query_t *q;
@@ -222,13 +221,6 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
222221
goto error;
223222
}
224223

225-
/* get a conduit for our use - we never route IO over fabric */
226-
OBJ_CONSTRUCT(&transports, opal_list_t);
227-
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
228-
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
229-
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
230-
OPAL_LIST_DESTRUCT(&transports);
231-
232224
/* we -may- need to know the name of the head
233225
* of our session directory tree, particularly the
234226
* tmp base where any other session directories on
@@ -269,15 +261,15 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
269261
val.data.string = NULL;
270262
OBJ_DESTRUCT(&val);
271263
/* set the route to be direct */
272-
if (ORTE_SUCCESS != orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) {
264+
if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) {
273265
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
274266
orte_finalize();
275267
exit(1);
276268
}
277269

278270
/* connect to the HNP so we can recv forwarded output */
279271
buf = OBJ_NEW(opal_buffer_t);
280-
ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP,
272+
ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP,
281273
buf, ORTE_RML_TAG_WARMUP_CONNECTION,
282274
orte_rml_send_callback, NULL);
283275
if (ORTE_SUCCESS != ret) {
@@ -287,7 +279,7 @@ int orte_ess_base_tool_setup(opal_list_t *flags)
287279
}
288280

289281
/* set the target hnp as our lifeline so we will terminate if it exits */
290-
orte_routed.set_lifeline(NULL, ORTE_PROC_MY_HNP);
282+
orte_routed.set_lifeline(ORTE_PROC_MY_HNP);
291283

292284
/* setup the IOF */
293285
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
@@ -317,8 +309,6 @@ int orte_ess_base_tool_finalize(void)
317309
{
318310
orte_wait_finalize();
319311

320-
orte_rml.close_conduit(orte_mgmt_conduit);
321-
322312
/* if I am a tool, then all I will have done is
323313
* a very small subset of orte_init - ensure that
324314
* I only back those elements out

orte/mca/ess/hnp/ess_hnp_module.c

-26
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,6 @@ static int rte_init(void)
141141
uint32_t h;
142142
int idx;
143143
orte_topology_t *t;
144-
opal_list_t transports;
145144
orte_ess_base_signal_t *sig;
146145
opal_value_t val;
147146

@@ -370,27 +369,6 @@ static int rte_init(void)
370369
goto error;
371370
}
372371

373-
/* get a conduit for our use - we never route IO over fabric */
374-
OBJ_CONSTRUCT(&transports, opal_list_t);
375-
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
376-
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
377-
if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
378-
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
379-
error = "orte_rml_open_mgmt_conduit";
380-
goto error;
381-
}
382-
OPAL_LIST_DESTRUCT(&transports);
383-
384-
OBJ_CONSTRUCT(&transports, opal_list_t);
385-
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
386-
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
387-
if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
388-
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
389-
error = "orte_rml_open_coll_conduit";
390-
goto error;
391-
}
392-
OPAL_LIST_DESTRUCT(&transports);
393-
394372
/* it is now safe to start the pmix server */
395373
pmix_server_start();
396374

@@ -776,10 +754,6 @@ static int rte_finalize(void)
776754
fflush(stdout);
777755
fflush(stderr);
778756

779-
/* release the conduits */
780-
orte_rml.close_conduit(orte_mgmt_conduit);
781-
orte_rml.close_conduit(orte_coll_conduit);
782-
783757
(void) mca_base_framework_close(&orte_iof_base_framework);
784758
(void) mca_base_framework_close(&orte_rtc_base_framework);
785759
(void) mca_base_framework_close(&orte_odls_base_framework);

orte/mca/filem/base/filem_base_receive.c

+3-5
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2016-2019 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -208,8 +208,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
208208
return;
209209
}
210210

211-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
212-
sender, answer,
211+
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
213212
ORTE_RML_TAG_FILEM_BASE_RESP,
214213
orte_rml_send_callback, NULL))) {
215214
ORTE_ERROR_LOG(rc);
@@ -301,8 +300,7 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
301300
goto CLEANUP;
302301
}
303302

304-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
305-
sender, answer,
303+
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
306304
ORTE_RML_TAG_FILEM_BASE_RESP,
307305
orte_rml_send_callback, NULL))) {
308306
ORTE_ERROR_LOG(rc);

orte/mca/filem/raw/filem_raw_module.c

+2-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
33
* All rights reserved
44
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
5-
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
5+
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
66
* Copyright (c) 2015-2017 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
88
* $COPYRIGHT$
@@ -870,8 +870,7 @@ static void send_complete(char *file, int status)
870870
OBJ_RELEASE(buf);
871871
return;
872872
}
873-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
874-
ORTE_PROC_MY_HNP, buf,
873+
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
875874
ORTE_RML_TAG_FILEM_BASE_RESP,
876875
orte_rml_send_callback, NULL))) {
877876
ORTE_ERROR_LOG(rc);

0 commit comments

Comments
 (0)