From b773f4ff34b180e0e8a7ca283d1ee0a80358ff16 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Thu, 28 Mar 2024 14:31:10 -0700 Subject: [PATCH] Add CommException class --- include/fenix.h | 2 + include/fenix_exception.hpp | 80 +++++++++++++++++ src/CMakeLists.txt | 1 + src/fenix_exception.cpp | 13 +++ test/CMakeLists.txt | 1 + test/exception_throw/CMakeLists.txt | 15 ++++ test/exception_throw/fenix_exceptions.cpp | 104 ++++++++++++++++++++++ 7 files changed, 216 insertions(+) create mode 100644 include/fenix_exception.hpp create mode 100644 src/fenix_exception.cpp create mode 100644 test/exception_throw/CMakeLists.txt create mode 100644 test/exception_throw/fenix_exceptions.cpp diff --git a/include/fenix.h b/include/fenix.h index 0a1d783..da01845 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -61,6 +61,8 @@ #include #if defined(c_plusplus) || defined(__cplusplus) +#include "fenix_exception.hpp" + extern "C" { #endif #include "fenix_data_subset.h" diff --git a/include/fenix_exception.hpp b/include/fenix_exception.hpp new file mode 100644 index 0000000..a7ad99b --- /dev/null +++ b/include/fenix_exception.hpp @@ -0,0 +1,80 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock +// +// Questions? Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef FENIX_EXCEPTION_HPP +#define FENIX_EXCEPTION_HPP + +#include <mpi.h> +#include <exception> + +namespace Fenix { +struct CommException : public std::exception { + MPI_Comm repaired_comm; //Communicator given to the constructor + const int fenix_err; //Fenix error code given to the constructor + void* user_data; //Opaque caller-supplied pointer; nullptr when none was given + CommException(MPI_Comm comm, int err, void* data = nullptr) : + repaired_comm(comm), fenix_err(err), user_data(data) { } +}; + +//Registers a callback that throws a CommException after any Fenix-handled +// communication failure. 
+//Parameter user_data is passed to the exception +//Returns the callback ID. +int register_exception_callback(void* user_data = nullptr); + +} + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a33ea2c..e7eaa6e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h) set (Fenix_SOURCES fenix.cpp +fenix_exception.cpp fenix_mpi_override.cpp fenix_opt.cpp fenix_process_recovery.cpp diff --git a/src/fenix_exception.cpp b/src/fenix_exception.cpp new file mode 100644 index 0000000..2198059 --- /dev/null +++ b/src/fenix_exception.cpp @@ -0,0 +1,13 @@ +#include "fenix_exception.hpp" +#include "fenix.h" + +namespace Fenix { +int register_exception_callback(void* user_data){ + return Fenix_Callback_register( + [](MPI_Comm repaired_comm, int fen_err, void* data){ + throw CommException(repaired_comm, fen_err, data); + }, + user_data + ); +} +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c4f2e92..ba6f65c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(request_cancelled) add_subdirectory(no_jump) add_subdirectory(issend) add_subdirectory(failed_spares) +add_subdirectory(exception_throw) diff --git a/test/exception_throw/CMakeLists.txt b/test/exception_throw/CMakeLists.txt new file mode 100644 index 0000000..10b683b --- /dev/null +++ b/test/exception_throw/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# This file is part of Fenix +# Copyright (c) 2016 Rutgers University and Sandia Corporation. +# This software is distributed under the BSD License. +# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +# the U.S. Government retains certain rights in this software. +# For more information, see the LICENSE file in the top Fenix +# directory. 
+# + +add_executable(fenix_exceptions fenix_exceptions.cpp) +target_link_libraries(fenix_exceptions fenix MPI::MPI_CXX) + +add_test(NAME exception_throw + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 6 ${MPIEXEC_PREFLAGS} fenix_exceptions ${MPIEXEC_POSTFLAGS}) diff --git a/test/exception_throw/fenix_exceptions.cpp b/test/exception_throw/fenix_exceptions.cpp new file mode 100644 index 0000000..c8c3431 --- /dev/null +++ b/test/exception_throw/fenix_exceptions.cpp @@ -0,0 +1,104 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Michael Heroux, and Matthew Whitlock +// +// Questions? Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#include <fenix.h> + +#include <assert.h> +#include <mpi.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +int main(int argc, char **argv) { + volatile int status = 0; + + MPI_Init(&argc, &argv); + + int fenix_role, error; + MPI_Comm res_comm; + MPI_Info info; + MPI_Info_create(&info); + MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP"); + MPI_Info_set(info, "FENIX_UNHANDLED_MODE", "NO_JUMP"); + Fenix_Init(&fenix_role, MPI_COMM_WORLD, &res_comm, &argc, &argv, 0, 0, info, &error); + + Fenix::register_exception_callback(); + + if(fenix_role == FENIX_ROLE_SURVIVOR_RANK){ + printf("FAILURE: longjmp instead of exception\n"); + status = 1; + } + + if (fenix_role == FENIX_ROLE_INITIAL_RANK) { + int rank; + MPI_Comm_rank(res_comm, &rank); + if(rank == 1) raise(SIGKILL); //Kill rank 1 so the barrier below hits a failure + + try { + MPI_Barrier(res_comm); + printf("FAILURE: barrier finished without fault\n"); + status = 1; + } catch (const Fenix::CommException&){ //Catch by const reference: no copy of the exception + printf("SUCCESS: caught CommException\n"); + } + } + + Fenix_Finalize(); + MPI_Finalize(); + + return status; +}