Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ services:
-DBUILD_TESTING=ON \
-DMPIEXEC_PREFLAGS="--allow-run-as-root;--map-by;:oversubscribe" && \
make -j

ENV OMPI_MCA_coll=^han
ENV OMPI_MCA_btl=tcp,sm,self

WORKDIR /fenix/build
ENTRYPOINT ["/entrypoint.sh"]
Expand Down
2 changes: 2 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
#include <setjmp.h>

#if defined(c_plusplus) || defined(__cplusplus)
#include "fenix.hpp"

extern "C" {
#endif

Expand Down
89 changes: 89 additions & 0 deletions include/fenix.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
//@HEADER
// ************************************************************************
//
//
// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
// _| _| _|_| _| _| _| _|
// _|_|_| _|_|_| _| _| _| _| _|
// _| _| _| _|_| _| _| _|
// _| _|_|_|_| _| _| _|_|_| _| _|
//
//
//
//
// Copyright (C) 2016 Rutgers University and Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
//
// ************************************************************************
//@HEADER
*/


#ifndef __FENIX_HPP__
#define __FENIX_HPP__

#include <mpi.h>
#include <functional>
#include "fenix.h"
#include "fenix_exception.hpp"

/**
 * @brief As the C-style callback, but accepts a std::function and does not use the void* pointer.
 *
 * The callable receives only an MPI communicator and an integer Fenix error
 * code. Because there is no void* user-data argument, any state the callback
 * needs has to travel inside the callable itself (for example as lambda
 * captures).
 *
 * @param[in] callback The function to register.
 *
 * @returnstatus
 */
int Fenix_Callback_register(std::function<void(MPI_Comm, int)> callback);

namespace Fenix {

/**
 * @brief Registers a callback that throws a CommException
 *
 * The exception is constructed from the communicator and the Fenix error
 * code that the callback is invoked with, so both are available to the
 * handler that catches it.
 *
 * This means no longjmp will occur, and instead applications
 * will continue from their try-catch error handler.
 *
 * @returnstatus
 */
int register_exception_callback();

} // namespace Fenix

#endif
74 changes: 74 additions & 0 deletions include/fenix_exception.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
//@HEADER
// ************************************************************************
//
//
// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
// _| _| _|_| _| _| _| _|
// _|_|_| _|_|_| _| _| _| _| _|
// _| _| _| _|_| _| _| _|
// _| _|_|_|_| _| _| _|_|_| _| _|
//
//
//
//
// Copyright (C) 2016 Rutgers University and Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
//
// ************************************************************************
//@HEADER
*/

#ifndef FENIX_EXCEPTION_HPP
#define FENIX_EXCEPTION_HPP

#include <mpi.h>
#include <exception>

namespace Fenix {

/**
 * @brief Exception carrying the arguments of a Fenix recovery callback.
 *
 * The callback installed by Fenix::register_exception_callback() throws this
 * type, forwarding the communicator and error code it was invoked with.
 */
struct CommException : public std::exception {
  /// Communicator supplied to the constructor.
  MPI_Comm repaired_comm;
  /// Fenix error code supplied to the constructor.
  const int fenix_err;

  /**
   * @param[in] comm Communicator to store in repaired_comm.
   * @param[in] err  Fenix error code to store in fenix_err.
   */
  CommException(MPI_Comm comm, int err) :
    repaired_comm(comm), fenix_err(err) { }

  /**
   * @brief Human-readable description of the exception.
   *
   * Without this override, an uncaught or logged CommException would only
   * show the generic std::exception text. The returned pointer refers to a
   * string literal, so it stays valid for the lifetime of the program.
   */
  const char* what() const noexcept override {
    return "Fenix::CommException: Fenix reported a communicator error (see fenix_err)";
  }
};

} // namespace Fenix

#endif
5 changes: 3 additions & 2 deletions include/fenix_ext.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#define __FENIX_EXT_H__

#include <mpi.h>
#include <vector>
#include "fenix.h"
#include "fenix_opt.hpp"
#include "fenix_data_group.hpp"
Expand All @@ -77,7 +78,7 @@ typedef struct {

//enum FenixRankRole role; // Role of rank: initial, survivor or repair
int role; // Role of rank: initial, survivor or repair
int fenix_init_flag;
int fenix_init_flag = 0;

int fail_world_size;
int* fail_world;
Expand All @@ -86,7 +87,7 @@ typedef struct {
int *ret_role;
int *ret_error;

fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions
std::vector<fenix_callback_func> callbacks;
fenix_debug_opt_t options; // This is reserved to store the user options

MPI_Comm *world; // Duplicate of the MPI communicator provided by user
Expand Down
19 changes: 3 additions & 16 deletions include/fenix_process_recovery.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,12 @@
#include <signal.h>

#include "fenix_init.h"
#include <functional>

#define __FENIX_RESUME_AT_INIT 0
#define __FENIX_RESUME_NO_JUMP 200

typedef void (*recover)( MPI_Comm, int, void *);

typedef struct fcouple {
recover x;
void *y;
} fenix_callback_func;

typedef struct __fenix_callback_list {
fenix_callback_func *callback;
struct __fenix_callback_list *next;
} fenix_callback_list_t;
// Type of a registered recovery callback: any callable taking an MPI
// communicator and an integer Fenix error code, and returning nothing.
using fenix_callback_func = std::function<void(MPI_Comm, int)>;

typedef struct __fenix_comm_list_elm {
struct __fenix_comm_list_elm *next;
Expand All @@ -98,16 +89,12 @@ int __fenix_create_new_world();

int __fenix_repair_ranks();

int __fenix_callback_register(void (*recover)(MPI_Comm, int, void *), void *);
int __fenix_callback_register(fenix_callback_func& recover);

int __fenix_callback_pop();

void __fenix_callback_push(fenix_callback_list_t **, fenix_callback_func *);

void __fenix_callback_invoke_all(int error);

int __fenix_callback_destroy(fenix_callback_list_t *callback_list);

int* __fenix_get_fail_ranks(int *, int, int);

int __fenix_spare_rank();
Expand Down
3 changes: 3 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h*)

set (Fenix_SOURCES
fenix.cpp
fenix_exception.cpp
fenix_opt.cpp
fenix_process_recovery.cpp
fenix_util.cpp
Expand All @@ -31,6 +32,8 @@ globals.cpp

add_library( fenix STATIC ${Fenix_SOURCES})

target_compile_features(fenix PRIVATE cxx_std_17)

target_link_libraries(fenix PUBLIC MPI::MPI_CXX)

target_include_directories(fenix
Expand Down
10 changes: 8 additions & 2 deletions src/fenix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,19 @@
#include "fenix_process_recovery.hpp"
#include "fenix_util.hpp"
#include "fenix_ext.hpp"
#include "fenix.h"
#include "fenix.hpp"

const Fenix_Data_subset FENIX_DATA_SUBSET_FULL = {0, NULL, NULL, NULL, 0, __FENIX_SUBSET_FULL};
const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY = {0, NULL, NULL, NULL, 0, __FENIX_SUBSET_EMPTY};

// C++ overload of the public registration entry point: hands the
// std::function to __fenix_callback_register() and returns its status code
// unchanged.
int Fenix_Callback_register(std::function<void(MPI_Comm, int)> callback){
return __fenix_callback_register(callback);
}

int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *), void *callback_data) {
return __fenix_callback_register(recover, callback_data);
return Fenix_Callback_register([recover, callback_data](MPI_Comm comm, int fenix_error){
recover(comm, fenix_error, callback_data);
});
}

int Fenix_Callback_pop() {
Expand Down
64 changes: 10 additions & 54 deletions src/fenix_callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,71 +65,27 @@
#include <mpi.h>


int __fenix_callback_register(void (*recover)(MPI_Comm, int, void *), void *callback_data)
int __fenix_callback_register(fenix_callback_func& recover)
{
int error_code = FENIX_SUCCESS;
if (fenix.fenix_init_flag) {
fenix_callback_func *fp = (fenix_callback_func *) s_malloc(sizeof(fenix_callback_func));
fp->x = recover;
fp->y = callback_data;
__fenix_callback_push( &fenix.callback_list, fp);
} else {
error_code = FENIX_ERROR_UNINITIALIZED;
}
return error_code;
if(!fenix.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED;

fenix.callbacks.push_back(recover);

return FENIX_SUCCESS;
}

int __fenix_callback_pop(){
if(!fenix.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED;
if(fenix.callback_list == NULL) return FENIX_ERROR_CALLBACK_NOT_REGISTERED;

fenix_callback_list_t* old_head = fenix.callback_list;
fenix.callback_list = old_head->next;
if(fenix.callbacks.empty()) return FENIX_ERROR_CALLBACK_NOT_REGISTERED;

free(old_head->callback);
free(old_head);
fenix.callbacks.pop_back();

return FENIX_SUCCESS;
}

void __fenix_callback_invoke_all(int error)
{
fenix_callback_list_t *current = fenix.callback_list;
while (current != NULL) {
(current->callback->x)((MPI_Comm) fenix.new_world, error,
(void *) current->callback->y);
current = current->next;
}
}

void __fenix_callback_push(fenix_callback_list_t **head, fenix_callback_func *fp)
{
fenix_callback_list_t *callback = (fenix_callback_list_t *) malloc(sizeof(fenix_callback_list_t));
callback->callback = fp;
callback->next = *head;
*head = callback;
}

int __fenix_callback_destroy(fenix_callback_list_t *callback_list)
{
int error_code = FENIX_SUCCESS;

if ( fenix.fenix_init_flag ) {

fenix_callback_list_t *current = callback_list;

while (current != NULL) {
fenix_callback_list_t *old;
old = current;
current = current->next;
free( old->callback );
free( old );
}

} else {
error_code = FENIX_ERROR_UNINITIALIZED;
for(auto it = fenix.callbacks.rbegin(); it != fenix.callbacks.rend(); it++){
(*it)(*fenix.user_world, error);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clarifying question: new_world and user_world are intended to be effectively the same communicator?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, new_world is just a duplicate of user_world, used internally so as not to interfere with the application. It should definitely be renamed for clarity at some point.

}

return error_code;
}

14 changes: 14 additions & 0 deletions src/fenix_exception.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include "fenix_exception.hpp"
#include "fenix.h"

namespace Fenix {

// Installs a recovery callback that reports the event by throwing:
// the communicator and Fenix error code the callback receives are
// forwarded, unchanged, into a CommException.
// Returns whatever status Fenix_Callback_register() reports.
int register_exception_callback(){
  const auto raise_comm_exception = [](MPI_Comm comm, int error_code){
    throw CommException(comm, error_code);
  };

  return Fenix_Callback_register(raise_comm_exception);
}

} // namespace Fenix
Loading