diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-10.3.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-10.3.0.eb index 57c38134baa..9d2b5068d87 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-10.3.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-10.3.0.eb @@ -15,6 +15,8 @@ patches = [ 'OpenMPI-4.1.1_fix-bufferoverflow-in-common_ofi.patch', 'OpenMPI-4.0.6_remove-pmix-check-in-pmi-switch.patch', 'OpenMPI-4.1.0-1-pml-ucx-datatype-memleak.patch', + 'OpenMPI-4.1.1_build-with-internal-cuda-header.patch', + 'OpenMPI-4.1.1_opal-datatype-cuda-performance.patch', ] checksums = [ 'e24f7a778bd11a71ad0c14587a7f5b00e68a71aa5623e2157bafee3d44c07cda', # openmpi-4.1.1.tar.bz2 @@ -24,10 +26,16 @@ checksums = [ '8acee6c9b2b4bf12873a39b85a58ca669de78e90d26186e52f221bb4853abc4d', # OpenMPI-4.1.0-1-pml-ucx-datatype-memleak.patch 'a94a74b174ce783328abfd3656ff5196b89ef4c819fe4c8b8a0f1277123e76ea', + # OpenMPI-4.1.1_build-with-internal-cuda-header.patch + '1ceb82b19f62da2525357debaae694d7751b6352adae7ffa55c71e19a4d7101c', + # OpenMPI-4.1.1_opal-datatype-cuda-performance.patch + 'b767c7166cf0b32906132d58de5439c735193c9fd09ec3c5c11db8d5fa68750e', ] builddependencies = [ ('pkg-config', '0.29.2'), + ('Perl', '5.32.1'), + ('Autotools', '20210128'), ] dependencies = [ @@ -39,6 +47,11 @@ dependencies = [ ('PMIx', '3.2.3'), ] +preconfigopts = './autogen.pl --force && ' + +# CUDA related patches and custom configure option can be removed if CUDA support isn't wanted. +configopts = '--with-cuda=internal ' + # disable MPI1 compatibility for now, see what breaks... # configopts = '--enable-mpi1-compatibility ' diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-11.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-11.2.0.eb index 593e8e3be27..24a5ae59bc0 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-11.2.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1-GCC-11.2.0.eb @@ -14,6 +14,8 @@ patches = [ 'OpenMPI-4.1.1_opal-pmix-package-rank.patch', 'OpenMPI-4.1.1_pmix3x-protection.patch', 'OpenMPI-4.1.0-1-pml-ucx-datatype-memleak.patch', + 'OpenMPI-4.1.1_build-with-internal-cuda-header.patch', + 'OpenMPI-4.1.1_opal-datatype-cuda-performance.patch', ] checksums = [ 'e24f7a778bd11a71ad0c14587a7f5b00e68a71aa5623e2157bafee3d44c07cda', # openmpi-4.1.1.tar.bz2 @@ -25,10 +27,16 @@ checksums = [ '384ef9f1fa803b0d71dae2ec0748d0f20295992437532afedf21478bda164ff8', # OpenMPI-4.1.1_pmix3x-protection.patch # OpenMPI-4.1.0-1-pml-ucx-datatype-memleak.patch 'a94a74b174ce783328abfd3656ff5196b89ef4c819fe4c8b8a0f1277123e76ea', + # OpenMPI-4.1.1_build-with-internal-cuda-header.patch + '1ceb82b19f62da2525357debaae694d7751b6352adae7ffa55c71e19a4d7101c', + # OpenMPI-4.1.1_opal-datatype-cuda-performance.patch + 'b767c7166cf0b32906132d58de5439c735193c9fd09ec3c5c11db8d5fa68750e', ] builddependencies = [ ('pkg-config', '0.29.2'), + ('Perl', '5.34.0'), + ('Autotools', '20210726'), ] dependencies = [ @@ -40,6 +48,11 @@ dependencies = [ ('PMIx', '4.1.0'), ] +preconfigopts = './autogen.pl --force && ' + +# CUDA related patches and custom configure option can be removed if CUDA support isn't wanted. +configopts = '--with-cuda=internal ' + # disable MPI1 compatibility for now, see what breaks... # configopts = '--enable-mpi1-compatibility ' diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1_build-with-internal-cuda-header.patch b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1_build-with-internal-cuda-header.patch new file mode 100644 index 00000000000..9c5945b7de2 --- /dev/null +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1_build-with-internal-cuda-header.patch @@ -0,0 +1,93 @@ +Allow building Open MPI with --with-cuda=internal, by providing an +internal minimal cuda.h header file. This eliminate the CUDA +(build)dependency; as long as the runtime CUDA version is 8.0+, +libcuda.so will be dlopen'ed and used successfully. + +Author: Bart Oldeman +--- openmpi-4.1.3.orig/config/opal_check_cuda.m4 2022-03-31 16:04:13.000000000 +0000 ++++ openmpi-4.1.3/config/opal_check_cuda.m4 2022-05-04 17:37:57.576260311 +0000 +@@ -45,6 +45,12 @@ + # macro as that would error out after not finding it in the first directory. + # Note that anywhere CUDA aware code is in the Open MPI repository requires + # us to make use of AC_REQUIRE to ensure this check has been done. ++opal_check_cuda_internal="" ++AS_IF([test "$with_cuda" = "internal"], ++ [AC_MSG_RESULT([internal support requested]) ++ with_cuda="${OPAL_TOP_SRCDIR}/opal/mca/common/cuda/cuda" ++ opal_check_cuda_internal=" (internal)" ++ ]) + AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"], + [opal_check_cuda_happy="no" + AC_MSG_RESULT([not set (--with-cuda=$with_cuda)])], +@@ -124,7 +130,7 @@ + CUDA_SUPPORT=0 + fi + +-OPAL_SUMMARY_ADD([[Miscellaneous]],[[CUDA support]],[opal_cuda], [$opal_check_cuda_happy]) ++OPAL_SUMMARY_ADD([[Miscellaneous]],[[CUDA support]],[opal_cuda], [$opal_check_cuda_happy$opal_check_cuda_internal]) + + AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"]) + AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT, +--- openmpi-4.1.3.orig/opal/mca/common/cuda/cuda/cuda.h 1970-01-01 00:00:00.000000000 +0000 ++++ openmpi-4.1.3/opal/mca/common/cuda/cuda/cuda.h 2022-05-04 18:52:14.991300184 +0000 +@@ -0,0 +1,60 @@ ++/* This header provides minimal parts of the CUDA Driver API, without having to ++ rely on the proprietary CUDA toolkit. ++ ++ References (to avoid copying from NVidia's proprietary cuda.h): ++ https://github.com/gcc-mirror/gcc/blob/master/include/cuda/cuda.h ++ https://github.com/Theano/libgpuarray/blob/master/src/loaders/libcuda.h ++ https://github.com/CPFL/gdev/blob/master/cuda/driver/cuda.h ++ https://github.com/CudaWrangler/cuew/blob/master/include/cuew.h ++*/ ++ ++#define CUDA_VERSION 8000 ++ ++typedef void *CUcontext; ++typedef int CUdevice; ++#if defined(__LP64__) || defined(_WIN64) ++typedef unsigned long long CUdeviceptr; ++#else ++typedef unsigned CUdeviceptr; ++#endif ++typedef void *CUevent; ++typedef void *CUstream; ++ ++typedef enum { ++ CUDA_SUCCESS = 0, ++ CUDA_ERROR_NOT_INITIALIZED = 3, ++ CUDA_ERROR_DEINITIALIZED = 4, ++ CUDA_ERROR_ALREADY_MAPPED = 208, ++ CUDA_ERROR_NOT_READY = 600, ++} CUresult; ++ ++enum { ++ CU_EVENT_DISABLE_TIMING = 0x2, ++ CU_EVENT_INTERPROCESS = 0x4, ++}; ++ ++enum { ++ CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1, ++}; ++ ++typedef enum { ++ CU_POINTER_ATTRIBUTE_CONTEXT = 1, ++ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, ++ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, ++ CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, ++ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, ++} CUpointer_attribute; ++ ++typedef enum { ++ CU_MEMORYTYPE_HOST = 0x01, ++} CUmemorytype; ++ ++typedef struct CUipcEventHandle CUipcEventHandle; ++ ++#define CU_IPC_HANDLE_SIZE 64 ++typedef struct CUipcMemHandle_st { ++ char reserved[CU_IPC_HANDLE_SIZE]; ++} CUipcMemHandle; ++ ++CUresult cuPointerGetAttributes (unsigned int numAttributes, ++ CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch new file mode 100644 index 00000000000..8c1a1f8ec35 --- /dev/null +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch @@ -0,0 +1,456 @@ +If Open MPI is built with support for CUDA there's a small +(up to 10%) performance penalty for small messages due to overhead +in the datatype memory copy functionality. + +This eliminates most of this overhead as follows: +1. Seperate compilation of CUDA code paths in pack/unpack routines + instead of runtime checks in inner loops, similar to the existing + checksum functionality. +2. Expose opal_cuda_enabled variable so it can be checked directly + in opal_datatype_copy_content_same_ddt() instead of calling a + function. +3. Eliminate cbmemcpy function pointer as it always points to + opal_cuda_memcpy(), and a direct call is cheaper. + +Signed off by Bart Oldeman + +diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am +index daaaa8e4b0..ef2da1cd81 100644 +--- a/opal/datatype/Makefile.am ++++ b/opal/datatype/Makefile.am +@@ -44,6 +44,11 @@ noinst_LTLIBRARIES = \ + # these sources will be compiled with the special -D + libdatatype_reliable_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c + libdatatype_reliable_la_CFLAGS = -DCHECKSUM $(AM_CFLAGS) ++if OPAL_cuda_support ++libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c ++libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS) ++noinst_LTLIBRARIES += libdatatype_gpu.la ++endif + + # these sources will be compiled with the normal CFLAGS only + libdatatype_la_SOURCES = \ +@@ -69,6 +74,9 @@ libdatatype_la_SOURCES = \ + opal_datatype_unpack.c + + libdatatype_la_LIBADD = libdatatype_reliable.la ++if OPAL_cuda_support ++libdatatype_la_LIBADD += libdatatype_gpu.la ++endif + + # Conditionally install the header files + if WANT_INSTALL_HEADERS +diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c +index 3931d99d17..33aebe2612 100644 +--- a/opal/datatype/opal_convertor.c ++++ b/opal/datatype/opal_convertor.c +@@ -40,8 +40,6 @@ + #include "opal/datatype/opal_convertor_internal.h" + #if OPAL_CUDA_SUPPORT + #include "opal/datatype/opal_datatype_cuda.h" +-#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ +- CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + #endif + + static void opal_convertor_construct( opal_convertor_t* convertor ) +@@ -51,9 +49,6 @@ static void opal_convertor_construct( opal_convertor_t* convertor ) + convertor->partial_length = 0; + convertor->remoteArch = opal_local_arch; + convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED; +-#if OPAL_CUDA_SUPPORT +- convertor->cbmemcpy = &opal_cuda_memcpy; +-#endif + } + + +@@ -241,11 +236,7 @@ int32_t opal_convertor_pack( opal_convertor_t* pConv, + if( OPAL_LIKELY(NULL == iov[i].iov_base) ) + iov[i].iov_base = (IOVBASE_TYPE *) base_pointer; + else +-#if OPAL_CUDA_SUPPORT +- MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv ); +-#else + MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len ); +-#endif + pending_length -= iov[i].iov_len; + base_pointer += iov[i].iov_len; + } +@@ -258,11 +249,7 @@ complete_contiguous_data_pack: + if( OPAL_LIKELY(NULL == iov[i].iov_base) ) + iov[i].iov_base = (IOVBASE_TYPE *) base_pointer; + else +-#if OPAL_CUDA_SUPPORT +- MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv ); +-#else + MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len ); +-#endif + pConv->bConverted = pConv->local_size; + *out_size = i + 1; + pConv->flags |= CONVERTOR_COMPLETED; +@@ -296,11 +283,7 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv, + if( iov[i].iov_len >= pending_length ) { + goto complete_contiguous_data_unpack; + } +-#if OPAL_CUDA_SUPPORT +- MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv ); +-#else + MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len ); +-#endif + pending_length -= iov[i].iov_len; + base_pointer += iov[i].iov_len; + } +@@ -310,11 +293,7 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv, + + complete_contiguous_data_unpack: + iov[i].iov_len = pending_length; +-#if OPAL_CUDA_SUPPORT +- MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv ); +-#else + MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len ); +-#endif + pConv->bConverted = pConv->local_size; + *out_size = i + 1; + pConv->flags |= CONVERTOR_COMPLETED; +@@ -530,7 +509,7 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor ) + \ + convertor->remote_size = convertor->local_size; \ + if( OPAL_LIKELY(convertor->remoteArch == opal_local_arch) ) { \ +- if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && \ ++ if( !(convertor->flags & (CONVERTOR_WITH_CHECKSUM | CONVERTOR_CUDA)) && \ + ((convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) || \ + ((convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count))) ) { \ + return OPAL_SUCCESS; \ +@@ -541,8 +520,8 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor ) + opal_convertor_compute_remote_size( convertor ); \ + assert( NULL != convertor->use_desc->desc ); \ + /* For predefined datatypes (contiguous) do nothing more */ \ +- /* if checksum is enabled then always continue */ \ +- if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \ ++ /* if checksum or cuda is enabled then always continue */ \ ++ if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | CONVERTOR_CUDA | OPAL_DATATYPE_FLAG_NO_GAPS)) \ + == OPAL_DATATYPE_FLAG_NO_GAPS) && \ + ((convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \ + (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { \ +@@ -592,7 +571,19 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, + } + } + } else +-#endif /* defined(CHECKSUM) */ ++#elif OPAL_CUDA_SUPPORT ++ if (OPAL_UNLIKELY(convertor->flags & CONVERTOR_CUDA)) { ++ if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) { ++ convertor->fAdvance = opal_unpack_general_gpu; ++ } else { ++ if (convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) { ++ convertor->fAdvance = opal_unpack_homogeneous_contig_gpu; ++ } else { ++ convertor->fAdvance = opal_generic_simple_unpack_gpu; ++ } ++ } ++ } else ++#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT */ + if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { + convertor->fAdvance = opal_unpack_general; + } else { +@@ -636,7 +627,25 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, + } + } + } else +-#endif /* defined(CHECKSUM) */ ++#elif OPAL_CUDA_SUPPORT ++ if (OPAL_UNLIKELY(convertor->flags & CONVERTOR_CUDA)) { ++ if (CONVERTOR_SEND_CONVERSION ++ == (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) { ++ convertor->fAdvance = opal_pack_general_gpu; ++ } else { ++ if (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) { ++ if (((datatype->ub - datatype->lb) == (ptrdiff_t) datatype->size) ++ || (1 >= convertor->count)) { ++ convertor->fAdvance = opal_pack_homogeneous_contig_gpu; ++ } else { ++ convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_gpu; ++ } ++ } else { ++ convertor->fAdvance = opal_generic_simple_pack_gpu; ++ } ++ } ++ } else ++#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT */ + if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { + convertor->fAdvance = opal_pack_general; + } else { +@@ -694,9 +703,6 @@ int opal_convertor_clone( const opal_convertor_t* source, + destination->bConverted = source->bConverted; + destination->stack_pos = source->stack_pos; + } +-#if OPAL_CUDA_SUPPORT +- destination->cbmemcpy = source->cbmemcpy; +-#endif + return OPAL_SUCCESS; + } + +diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h +index b24d94c37b..53b6f0d526 100644 +--- a/opal/datatype/opal_convertor.h ++++ b/opal/datatype/opal_convertor.h +@@ -118,7 +118,6 @@ struct opal_convertor_t { + dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ + + #if OPAL_CUDA_SUPPORT +- memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ + void * stream; /**< CUstream for async copy */ + #endif + }; +diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c +index c70bdd24df..d7c10af3dc 100644 +--- a/opal/datatype/opal_datatype_copy.c ++++ b/opal/datatype/opal_datatype_copy.c +@@ -86,14 +86,6 @@ static size_t opal_datatype_memop_block_size = 128 * 1024; + #define MEM_OP opal_cuda_memmove + #include "opal_datatype_copy.h" + +-#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function) \ +- do { \ +- if (true == cuda_device_bufs) { \ +- fct = copy_function; \ +- } \ +- } while(0) +-#else +-#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function) + #endif + + int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count, +@@ -102,10 +94,6 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in + ptrdiff_t extent; + int32_t (*fct)( const opal_datatype_t*, int32_t, char*, char*); + +-#if OPAL_CUDA_SUPPORT +- bool cuda_device_bufs = opal_cuda_check_bufs(destination_base, source_base); +-#endif +- + DO_DEBUG( opal_output( 0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n", + (void*)datatype, count, (void*)destination_base, (void*)source_base ); ); + +@@ -122,20 +110,25 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in + extent = (datatype->true_ub - datatype->true_lb) + (count - 1) * (datatype->ub - datatype->lb); + + fct = non_overlap_copy_content_same_ddt; +- SET_CUDA_COPY_FCT(cuda_device_bufs, fct, non_overlap_cuda_copy_content_same_ddt); + if( destination_base < source_base ) { + if( (destination_base + extent) > source_base ) { + /* memmove */ + fct = overlap_copy_content_same_ddt; +- SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt); + } + } else { + if( (source_base + extent) > destination_base ) { + /* memmove */ + fct = overlap_copy_content_same_ddt; +- SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt); + } + } ++ ++#if OPAL_CUDA_SUPPORT ++ if (OPAL_UNLIKELY(opal_cuda_enabled) && opal_cuda_check_bufs(destination_base, source_base)) { ++ fct = (fct == non_overlap_copy_content_same_ddt ? ++ non_overlap_cuda_copy_content_same_ddt : overlap_cuda_copy_content_same_ddt); ++ } ++#endif ++ + return fct( datatype, count, destination_base, source_base ); + } + +diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c +index 7869f17e90..f3b3cef8da 100644 +--- a/opal/datatype/opal_datatype_cuda.c ++++ b/opal/datatype/opal_datatype_cuda.c +@@ -20,7 +20,7 @@ + + static bool initialized = false; + int opal_cuda_verbose = 0; +-static int opal_cuda_enabled = 0; /* Starts out disabled */ ++int opal_cuda_enabled = 1; /* Starts out enabled */ + static int opal_cuda_output = 0; + static void opal_cuda_support_init(void); + static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; +@@ -48,10 +48,6 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) + opal_cuda_support_init(); + } + +- /* This is needed to handle case where convertor is not fully initialized +- * like when trying to do a sendi with convertor on the statck */ +- convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy; +- + /* If not enabled, then nothing else to do */ + if (!opal_cuda_enabled) { + return; +@@ -192,6 +188,7 @@ static void opal_cuda_support_init(void) + + /* Callback into the common cuda initialization routine. This is only + * set if some work had been done already in the common cuda code.*/ ++ opal_cuda_enabled = 0; + if (NULL != common_cuda_initialization_function) { + if (0 == common_cuda_initialization_function(&ftable)) { + opal_cuda_enabled = 1; +diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h +index 2789320520..d512e24550 100644 +--- a/opal/datatype/opal_datatype_cuda.h ++++ b/opal/datatype/opal_datatype_cuda.h +@@ -30,4 +30,6 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); + void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); + void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); + ++extern int opal_cuda_enabled; ++ + #endif +diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c +index b4e03a9bea..f42e292e0b 100644 +--- a/opal/datatype/opal_datatype_pack.c ++++ b/opal/datatype/opal_datatype_pack.c +@@ -45,6 +45,11 @@ + #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum + #define opal_generic_simple_pack_function opal_generic_simple_pack_checksum + #define opal_pack_general_function opal_pack_general_checksum ++#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU) ++#define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_gpu ++#define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_gpu ++#define opal_generic_simple_pack_function opal_generic_simple_pack_gpu ++#define opal_pack_general_function opal_pack_general_gpu + #else + #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig + #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps +diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h +index 2a2e79180d..7fbf0c88e2 100644 +--- a/opal/datatype/opal_datatype_pack.h ++++ b/opal/datatype/opal_datatype_pack.h +@@ -19,11 +19,12 @@ + + #include "opal_config.h" + +-#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT ++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU) + /* Make use of existing macro to do CUDA style memcpy */ ++#include "opal_datatype_cuda.h" + #undef MEMCPY_CSUM + #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ +- CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) ++ opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + #endif + + /** +diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h +index 668397112b..111f74f2a4 100644 +--- a/opal/datatype/opal_datatype_prototypes.h ++++ b/opal/datatype/opal_datatype_prototypes.h +@@ -39,6 +39,16 @@ OPAL_DECLSPEC int32_t + opal_unpack_general_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); ++#if OPAL_CUDA_SUPPORT ++OPAL_DECLSPEC int32_t ++opal_pack_general_gpu( opal_convertor_t* pConvertor, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++OPAL_DECLSPEC int32_t ++opal_unpack_general_gpu( opal_convertor_t* pConvertor, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++#endif + + /* + * Now the internal functions +@@ -83,6 +93,28 @@ int32_t + opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); ++#if OPAL_CUDA_SUPPORT ++int32_t ++opal_pack_homogeneous_contig_gpu( opal_convertor_t* pConv, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++int32_t ++opal_pack_homogeneous_contig_with_gaps_gpu( opal_convertor_t* pConv, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++int32_t ++opal_generic_simple_pack_gpu( opal_convertor_t* pConvertor, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++int32_t ++opal_unpack_homogeneous_contig_gpu( opal_convertor_t* pConv, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++int32_t ++opal_generic_simple_unpack_gpu( opal_convertor_t* pConvertor, ++ struct iovec* iov, uint32_t* out_size, ++ size_t* max_data ); ++#endif + + END_C_DECLS + +diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c +index 26a5810dc0..668b6624aa 100644 +--- a/opal/datatype/opal_datatype_unpack.c ++++ b/opal/datatype/opal_datatype_unpack.c +@@ -46,6 +46,10 @@ + #define opal_unpack_general_function opal_unpack_general_checksum + #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum + #define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum ++#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU) ++#define opal_unpack_general_function opal_unpack_general_gpu ++#define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_gpu ++#define opal_generic_simple_unpack_function opal_generic_simple_unpack_gpu + #else + #define opal_unpack_general_function opal_unpack_general + #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig +@@ -204,10 +208,10 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_ + MEMCPY( temporary + start_position, partial_data, length ); + + /* Save the original content of the user memory */ +-#if OPAL_CUDA_SUPPORT ++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU) + /* In the case where the data is being unpacked from device memory, need to + * use the special host to device memory copy. */ +- pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor ); ++ opal_cuda_memcpy(saved_data, user_data, data_length, pConvertor ); + #else + MEMCPY( saved_data, user_data, data_length ); + #endif +@@ -222,15 +226,15 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_ + + /* Rebuild the data by pulling back the unmodified bytes from the original + * content in the user memory. */ +-#if OPAL_CUDA_SUPPORT ++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU) + /* Need to copy the modified user_data again so we can see which + * bytes need to be converted back to their original values. */ + { + char resaved_data[16]; +- pConvertor->cbmemcpy(resaved_data, user_data, data_length, pConvertor ); ++ opal_cuda_memcpy(resaved_data, user_data, data_length, pConvertor ); + for(size_t i = 0; i < data_length; i++ ) { + if( unused_byte == resaved_data[i] ) +- pConvertor->cbmemcpy(&user_data[i], &saved_data[i], 1, pConvertor); ++ opal_cuda_memcpy(&user_data[i], &saved_data[i], 1, pConvertor); + } + } + #else +diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h +index 33db837882..4159a475fc 100644 +--- a/opal/datatype/opal_datatype_unpack.h ++++ b/opal/datatype/opal_datatype_unpack.h +@@ -19,11 +19,12 @@ + + #include "opal_config.h" + +-#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT ++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU) + /* Make use of existing macro to do CUDA style memcpy */ ++#include "opal_datatype_cuda.h" + #undef MEMCPY_CSUM + #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ +- CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) ++ opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) + #endif + + /**