From 9c1f649739a2e9f1ea20158893a732aba1eb83bc Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 15 Nov 2016 19:27:26 -0700 Subject: [PATCH] Fix debugger attach and cospawn of debugger daemons for the STAT debugger. Add ability to test the support minus the actual debugger. Fixes #2411 Continue cleanup of STAT debugger attach: * Limit the number of times we retry sending of a message to avoid an infinite loop * Don't execute the "init_debugger_after_spawn" state for debugger jobs * Add a new test program "attach" that takes the debugger attach fifo as its argument, and then simulates attach by writing a byte down the fifo Output the attach fifo info if we are testing attach so we know where to attach to - otherwise, use the output_verbose Always send "debugger release" to the job actually being debugged, not the debugger itself Signed-off-by: Ralph Castain Remove debug Signed-off-by: Ralph Castain --- .gitignore | 1 + ompi/mca/rte/orte/rte_orte_module.c | 10 +- orte/mca/oob/base/oob_base_stubs.c | 14 +- orte/mca/plm/base/plm_base_launch_support.c | 9 +- orte/mca/rml/base/base.h | 4 +- orte/mca/rml/base/rml_base_frame.c | 10 +- orte/mca/routed/radix/routed_radix.c | 6 +- orte/mca/schizo/base/schizo_base_stubs.c | 13 +- orte/mca/state/base/state_base_fns.c | 22 +-- orte/runtime/orte_mca_params.c | 4 +- orte/test/mpi/Makefile | 2 +- orte/test/mpi/attach.c | 30 ++++ orte/tools/orterun/orterun.c | 154 ++++++++++++++------ orte/util/error_strings.c | 4 +- 14 files changed, 206 insertions(+), 77 deletions(-) create mode 100644 orte/test/mpi/attach.c diff --git a/.gitignore b/.gitignore index b3524bc3ff3..d14b27001d7 100644 --- a/.gitignore +++ b/.gitignore @@ -341,6 +341,7 @@ orte/mca/sstore/orte_sstore.7 orte/test/mpi/abort orte/test/mpi/accept +orte/test/mpi/attach orte/test/mpi/bad_exit orte/test/mpi/bcast_loop orte/test/mpi/concurrent_spawn diff --git a/ompi/mca/rte/orte/rte_orte_module.c b/ompi/mca/rte/orte/rte_orte_module.c index ce35edfda46..8112510aa1c 100644 
--- a/ompi/mca/rte/orte/rte_orte_module.c +++ b/ompi/mca/rte/orte/rte_orte_module.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2012-2014 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -104,6 +104,8 @@ void ompi_rte_wait_for_debugger(void) { int debugger; orte_rml_recv_cb_t xfer; + char *evar; + int time; /* See lengthy comment in orte/tools/orterun/debuggers.c about orte_in_parallel_debugger */ @@ -123,6 +125,12 @@ void ompi_rte_wait_for_debugger(void) */ ompi_debugger_setup_dlls(); + if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) { + time = strtol(evar, NULL, 10); + sleep(time); + return; + } + if (orte_standalone_operation) { /* spin until debugger attaches and releases us */ while (MPIR_debug_gate == 0) { diff --git a/orte/mca/oob/base/oob_base_stubs.c b/orte/mca/oob/base/oob_base_stubs.c index 3032451c38f..9adcf7e3900 100644 --- a/orte/mca/oob/base/oob_base_stubs.c +++ b/orte/mca/oob/base/oob_base_stubs.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -117,9 +117,16 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata) * this is a local proc we just haven't heard from * yet due to a race condition. 
Check that situation */ if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - ORTE_OOB_SEND(msg); - return; + ++msg->retries; + if (msg->retries < orte_rml_base.max_retries) { + ORTE_OOB_SEND(msg); + return; + } } + opal_output_verbose(5, orte_oob_base_framework.framework_output, + "%s CANNOT SEND TO %s: TAG %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&msg->dst), msg->tag); msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN; ORTE_RML_SEND_COMPLETE(msg); return; @@ -396,4 +403,3 @@ static void process_uri(char *uri) } opal_argv_free(uris); } - diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index c159fe5757a..c09d027c0a8 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -13,7 +13,7 @@ * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. @@ -757,9 +757,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata) } cleanup: - /* need to init_after_spawn for debuggers */ - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); - + /* if this wasn't a debugger job, then need to init_after_spawn for debuggers */ + if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); + } OBJ_RELEASE(caddy); } diff --git a/orte/mca/rml/base/base.h b/orte/mca/rml/base/base.h index 6b29d07f626..9c5cb594d0b 100644 --- a/orte/mca/rml/base/base.h +++ b/orte/mca/rml/base/base.h @@ -12,7 +12,7 @@ * All rights reserved. 
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -84,6 +84,7 @@ ORTE_DECLSPEC void orte_rml_base_comm_stop(void); typedef struct { opal_list_t posted_recvs; opal_list_t unmatched_msgs; + int max_retries; #if OPAL_ENABLE_TIMING bool timing; #endif @@ -123,6 +124,7 @@ typedef struct { orte_process_name_t origin; int status; // returned status on send orte_rml_tag_t tag; // targeted tag + int retries; // #times we have tried to send it /* user's send callback functions and data */ union { diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c index 33dcbde6fa8..6e410168fa5 100644 --- a/orte/mca/rml/base/rml_base_frame.c +++ b/orte/mca/rml/base/rml_base_frame.c @@ -61,6 +61,14 @@ static int orte_rml_base_register(mca_base_register_flag_t flags) &orte_rml_base_wrapper); (void) mca_base_var_register_synonym(var_id, "orte", "rml",NULL,"wrapper", 0); + orte_rml_base.max_retries = 3; + mca_base_var_register("orte", "rml", "base", "max_retries", + "Max #times to retry sending a message", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &orte_rml_base.max_retries); + #if OPAL_ENABLE_TIMING orte_rml_base.timing = false; (void) mca_base_var_register ("orte", "rml", "base", "timing", @@ -259,6 +267,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender, /*** RML CLASS INSTANCES ***/ static void send_cons(orte_rml_send_t *ptr) { + ptr->retries = 0; ptr->cbdata = NULL; ptr->iov = NULL; ptr->buffer = NULL; @@ -325,4 +334,3 @@ static void prq_des(orte_rml_recv_request_t *ptr) OBJ_CLASS_INSTANCE(orte_rml_recv_request_t, opal_object_t, prq_cons, prq_des); - diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c index e342c1cdafe..86ba76828ff 100644 --- 
a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -6,7 +6,7 @@ * reserved. * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -372,6 +372,10 @@ static orte_process_name_t get_route(orte_process_name_t *target) daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) { + opal_output_verbose(2, orte_routed_base_framework.framework_output, + "%s ATTEMPTING TO SEND TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(target)); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ret = ORTE_NAME_INVALID; goto found; diff --git a/orte/mca/schizo/base/schizo_base_stubs.c b/orte/mca/schizo/base/schizo_base_stubs.c index a2e5fe1bf2c..96fbba01b8e 100644 --- a/orte/mca/schizo/base/schizo_base_stubs.c +++ b/orte/mca/schizo/base/schizo_base_stubs.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -22,7 +22,6 @@ int orte_schizo_base_parse_cli(char *personality, orte_schizo_base_active_module_t *mod; if (NULL == personality) { - opal_output(0, "NULL PERSONALITY"); return ORTE_ERR_NOT_SUPPORTED; } @@ -63,6 +62,11 @@ int orte_schizo_base_setup_fork(orte_job_t *jdata, int rc; orte_schizo_base_active_module_t *mod; + /* if no personality was specified, then nothing to do */ + if (NULL == jdata->personality) { + return ORTE_SUCCESS; + } + OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) { if (NULL != mod->module->setup_fork) { @@ -81,6 +85,11 @@ int orte_schizo_base_setup_child(orte_job_t *jdata, int rc; orte_schizo_base_active_module_t *mod; + /* if no personality was specified, then nothing to do */ + if (NULL == jdata->personality) { + return ORTE_SUCCESS; + } + OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) { if (NULL != mod->module->setup_child) { diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 7af1cfd5dcf..273b20d6dbf 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -521,13 +521,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; - if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { + if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. 
This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); - } + } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) @@ -550,11 +550,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) } /* return the allocated slot for reuse */ cleanup_node(pdata); - /* track job status */ - jdata->num_terminated++; - if (jdata->num_terminated == jdata->num_procs) { + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); - } + } } cleanup: @@ -752,10 +752,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) * is maintained! */ if (1 < j) { - if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { - /* this was a debugger daemon. notify that a debugger has detached */ - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); - } + if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { + /* this was a debugger daemon. notify that a debugger has detached */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); + } opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ OBJ_RELEASE(jdata); } diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 69c0c7ee02d..0fdc1db91d3 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -13,7 +13,7 @@ * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -276,7 +276,7 @@ int orte_register_params(void) "Test debugger colaunch after debugger attachment", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &orte_debugger_test_daemon); + &orte_debugger_test_attach); orte_debugger_check_rate = 0; (void) mca_base_var_register ("orte", "orte", NULL, "debugger_check_rate", diff --git a/orte/test/mpi/Makefile b/orte/test/mpi/Makefile index 8dd29b0c1c6..3a0074aa325 100644 --- a/orte/test/mpi/Makefile +++ b/orte/test/mpi/Makefile @@ -1,4 +1,4 @@ -PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll +PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach all: $(PROGS) diff --git a/orte/test/mpi/attach.c b/orte/test/mpi/attach.c new file mode 100644 index 00000000000..adbe00e2deb --- /dev/null +++ b/orte/test/mpi/attach.c @@ -0,0 +1,30 @@ +/* -*- C -*- + * + * $HEADER$ + * + * Simulate a debugger attach: write the attach command byte (1) to the fifo named by argv[1] + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +int main(int 
argc, char* argv[]) +{ + unsigned char fifo_cmd = 1; + int fd; + + if (2 > argc) { + fprintf(stderr, "usage: attach <path-to-attach-fifo>\n"); + exit(1); + } + + fd = open(argv[1], O_WRONLY); + write(fd, &fifo_cmd, sizeof(unsigned char)); + close(fd); + + return 0; +} diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 796f59d5d6b..6951367cabe 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -578,7 +578,7 @@ static void run_next_job(int fd, short args, void *cbdata) int orterun(int argc, char *argv[]) { - int rc; + int rc, i; opal_cmd_line_t cmd_line; char *param; orte_job_t *daemons; @@ -1024,6 +1024,20 @@ int orterun(int argc, char *argv[]) } } + /* check for debugger test envars and forward them if necessary */ + if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) { + char *evar; + evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"); + for (i=0; i < (int)jdata->num_apps; i++) { + if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env); + if (NULL != evar) { + opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env); + } + } + } + } + /* check for suicide test directives */ if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { @@ -2135,7 +2149,7 @@ static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, { int i, id, ret; char **new_argv = NULL; - const char **tmp; + const char **tmp=NULL; char *value, **lines, *env_name; /* Get the orte_base_debug MCA parameter and search for a debugger @@ -2210,7 +2224,7 @@ static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, * - fills in the table MPIR_proctable, and sets MPIR_proctable_size * - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1) * - calls MPIR_Breakpoint() which the debugger will have a - * breakpoint on. + * breakpoint on. * * b) Applications start and then spin until MPIR_debug_gate is set * non-zero by the debugger. 
@@ -2321,12 +2335,13 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata) if (!MPIR_being_debugged && !orte_in_parallel_debugger) { /* if we were given a test debugger, then we still want to - * colaunch it + * colaunch it - unless we are testing attach to a running job */ - if (NULL != orte_debugger_test_daemon) { + if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) { opal_output_verbose(2, orte_debug_output, - "%s No debugger test daemon specified", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + "%s Debugger test daemon specified: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_debugger_test_daemon); goto launchit; } /* if we were given an auto-detect rate, then we want to setup @@ -2450,6 +2465,9 @@ static void setup_debugger_job(void) proc = OBJ_NEW(orte_proc_t); proc->name.jobid = debugger->jobid; proc->name.vpid = vpid++; + /* point the proc at the local ORTE daemon as its parent */ + proc->parent = node->daemon->name.vpid; + /* set the local/node ranks - we don't actually care * what these are, but the odls needs them */ @@ -2490,7 +2508,7 @@ static bool mpir_breakpoint_fired = false; void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - orte_job_t *jdata = caddy->jdata; + orte_job_t *jdata = caddy->jdata, *target; orte_proc_t *proc; orte_app_context_t *appctx; orte_vpid_t i, j; @@ -2498,35 +2516,55 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) int rc; char **aliases, *aptr; + opal_output_verbose(5, orte_debug_output, + "%s INIT AFTER SPAWN FOR %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(caddy->jdata->jobid)); + /* if we couldn't get thru the mapper stage, we might * enter here with no procs. 
Avoid the "zero byte malloc" * message by checking here */ if (MPIR_proctable || 0 == jdata->num_procs) { + /* already initialized */ opal_output_verbose(5, orte_debug_output, "%s: debugger already initialized or zero procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - OBJ_RELEASE(caddy); - if (!mpir_breakpoint_fired) { - /* record that we have triggered the debugger */ - mpir_breakpoint_fired = true; - - /* trigger the debugger */ - MPIR_Breakpoint(); - /* send a message to rank=0 to release it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || - ORTE_PROC_STATE_UNTERMINATED < proc->state ) { - /* proc is already dead */ - return; - } - buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ - if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, - ORTE_RML_TAG_DEBUGGER_RELEASE, - orte_rml_send_callback, NULL))) { - opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); - OBJ_RELEASE(buf); + if (MPIR_being_debugged || NULL != orte_debugger_test_daemon || + NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) { + OBJ_RELEASE(caddy); + if (!mpir_breakpoint_fired) { + /* record that we have triggered the debugger */ + mpir_breakpoint_fired = true; + + /* trigger the debugger */ + MPIR_Breakpoint(); + + /* send a message to rank=0 of the job being debugged to release it */ + target = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1); + if (NULL == target) { + /* the job is dead */ + return; + } + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(target->procs, 0)) || + ORTE_PROC_STATE_UNTERMINATED < proc->state ) { + /* proc is already dead */ + return; + } + buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ + opal_output_verbose(5, orte_debug_output, + "%s SENDING DEBUGGER RELEASE TO %s %s:%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), + __FILE__, __LINE__); + if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, + 
ORTE_RML_TAG_DEBUGGER_RELEASE, + orte_rml_send_callback, NULL))) { + opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); + OBJ_RELEASE(buf); + } } } return; @@ -2621,8 +2659,13 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) /* trigger the debugger */ MPIR_Breakpoint(); - /* send a message to rank=0 to release it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || + /* send a message to rank=0 of the job being debugged to release it */ + target = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1); + if (NULL == target) { + /* the job is dead */ + return; + } + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(target->procs, 0)) || ORTE_PROC_STATE_UNTERMINATED < proc->state) { /* proc is already dead or never registered with us (so we don't have * contact info for him) @@ -2630,9 +2673,10 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) return; } opal_output_verbose(2, orte_debug_output, - "%s sending debugger release to %s", + "%s SENDING DEBUGGER RELEASE TO %s %s:%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name)); + ORTE_NAME_PRINT(&proc->name), + __FILE__, __LINE__); buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, ORTE_RML_TAG_DEBUGGER_RELEASE, @@ -2640,7 +2684,7 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); OBJ_RELEASE(buf); } - } else { + } else if (!orte_debugger_test_attach) { /* if I am launching debugger daemons, then I need to do so now * that the job has been started and I know which nodes have * apps on them @@ -2673,14 +2717,14 @@ static void orte_debugger_detached(int fd, short event, void *cbdata) static void open_fifo (void) { if (attach_fd > 0) { - close(attach_fd); + 
close(attach_fd); } attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0); if (attach_fd < 0) { - opal_output(0, "%s unable to open debugger attach fifo", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; + opal_output(0, "%s unable to open debugger attach fifo", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; } /* Set this fd to be close-on-exec so that children don't see it */ @@ -2692,10 +2736,16 @@ static void open_fifo (void) return; } - opal_output_verbose(2, orte_debug_output, - "%s Monitoring debugger attach fifo %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - MPIR_attach_fifo); + if (orte_debugger_test_attach) { + opal_output(0, "%s Monitoring debugger attach fifo %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + MPIR_attach_fifo); + } else { + opal_output_verbose(2, orte_debug_output, + "%s Monitoring debugger attach fifo %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + MPIR_attach_fifo); + } attach = (opal_event_t*)malloc(sizeof(opal_event_t)); opal_event_set(orte_event_base, attach, attach_fd, OPAL_EV_READ, attach_debugger, attach); @@ -2703,6 +2753,8 @@ static void open_fifo (void) opal_event_add(attach, 0); } +static bool did_once = false; + static void attach_debugger(int fd, short event, void *arg) { unsigned char fifo_cmd; @@ -2712,16 +2764,16 @@ static void attach_debugger(int fd, short event, void *arg) if (fifo_active) { attach = (opal_event_t*)arg; - fifo_active = false; + fifo_active = false; rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd)); - if (!rc) { - /* release the current event */ - opal_event_free(attach); - /* reopen device to clear hangup */ - open_fifo(); - return; - } + if (!rc) { + /* release the current event */ + opal_event_free(attach); + /* reopen device to clear hangup */ + open_fifo(); + return; + } if (1 != fifo_cmd) { /* ignore the cmd */ fifo_active = true; @@ -2758,6 +2810,12 @@ static void attach_debugger(int fd, short event, void *arg) (NULL == orte_debugger_test_daemon) ? 
MPIR_executable_path : orte_debugger_test_daemon); setup_debugger_job(); + did_once = true; + } + + /* if we are testing, ensure we only do this once */ + if (NULL != orte_debugger_test_daemon && did_once) { + return; } /* reset the read or timer event */ diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index e8c64f4be23..df432a641a6 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -12,7 +12,7 @@ * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -306,6 +306,8 @@ const char *orte_job_state_to_str(orte_job_state_t state) return "FAULT TOLERANCE RESTART"; case ORTE_JOB_STATE_ANY: return "ANY"; + case ORTE_JOB_STATE_DEBUGGER_DETACH: + return "DEBUGGER DETACHED"; default: return "UNKNOWN STATE!"; }