-
Notifications
You must be signed in to change notification settings - Fork 762
{mpi}[GCC/11.3.0] OpenMPI v4.1.4 #15426
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
{mpi}[GCC/11.3.0] OpenMPI v4.1.4 #15426
Conversation
….0-GCCcore-11.3.0.eb, PMIx-4.1.2-GCCcore-11.3.0.eb, UCX-1.12.1-GCCcore-11.3.0.eb, OpenMPI-4.1.4rc1-GCC-11.3.0.eb
|
Test report by @SebastianAchilles |
This comment was marked as outdated.
This comment was marked as outdated.
|
Can we consider adding the patch in #14919 (comment) and ? |
|
@bartoldeman May have another patch to reduce the overhead of CUDA that this might introduce (see comment from @Micket) |
This comment was marked as outdated.
This comment was marked as outdated.
|
I think Bart was looking to get that performance patch merged upstream, so they would likely have some opinions on how to do it and how they like to have their defines. But of course, we don't have to care about that when patching all the versions that have already been released. I stole this from the thread (I hope it wasn't secret Bart!) diff -ur openmpi-4.1.1.orig/opal/datatype/opal_convertor.c openmpi-4.1.1/opal/datatype/opal_convertor.c
--- openmpi-4.1.1.orig/opal/datatype/opal_convertor.c 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_convertor.c 2022-05-06 11:40:08.698454429 -0400
@@ -41,7 +41,7 @@
#if OPAL_CUDA_SUPPORT
#include "opal/datatype/opal_datatype_cuda.h"
#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
- CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif
static void opal_convertor_construct( opal_convertor_t* convertor )
@@ -51,9 +51,6 @@
convertor->partial_length = 0;
convertor->remoteArch = opal_local_arch;
convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
-#if OPAL_CUDA_SUPPORT
- convertor->cbmemcpy = &opal_cuda_memcpy;
-#endif
}
@@ -694,9 +691,6 @@
destination->bConverted = source->bConverted;
destination->stack_pos = source->stack_pos;
}
-#if OPAL_CUDA_SUPPORT
- destination->cbmemcpy = source->cbmemcpy;
-#endif
return OPAL_SUCCESS;
}
diff -ur openmpi-4.1.1.orig/opal/datatype/opal_convertor.h openmpi-4.1.1/opal/datatype/opal_convertor.h
--- openmpi-4.1.1.orig/opal/datatype/opal_convertor.h 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_convertor.h 2022-05-06 09:59:06.242836736 -0400
@@ -118,7 +118,6 @@
dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */
#if OPAL_CUDA_SUPPORT
- memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */
void * stream; /**< CUstream for async copy */
#endif
};
diff -ur openmpi-4.1.1.orig/opal/datatype/opal_datatype_cuda.c openmpi-4.1.1/opal/datatype/opal_datatype_cuda.c
--- openmpi-4.1.1.orig/opal/datatype/opal_datatype_cuda.c 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_datatype_cuda.c 2022-05-06 10:26:38.659033919 -0400
@@ -48,10 +48,6 @@
opal_cuda_support_init();
}
- /* This is needed to handle case where convertor is not fully initialized
- * like when trying to do a sendi with convertor on the statck */
- convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
-
/* If not enabled, then nothing else to do */
if (!opal_cuda_enabled) {
return;
@@ -112,20 +108,17 @@
}
/*
- * With CUDA enabled, all contiguous copies will pass through this function.
- * Therefore, the first check is to see if the convertor is a GPU buffer.
+ * With CUDA enabled, all contiguous copies will pass through opal_cuda_memcpy which
+ * calls this function. Therefore, that function checks inline to see if the convertor
+ * is a GPU buffer.
* Note that if there is an error with any of the CUDA calls, the program
* aborts as there is no recovering.
*/
-void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
+void *opal_cuda_memcpy_gpu(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
{
int res;
- if (!(convertor->flags & CONVERTOR_CUDA)) {
- return memcpy(dest, src, size);
- }
-
if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor);
} else {
diff -ur openmpi-4.1.1.orig/opal/datatype/opal_datatype_cuda.h openmpi-4.1.1/opal/datatype/opal_datatype_cuda.h
--- openmpi-4.1.1.orig/opal/datatype/opal_datatype_cuda.h 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_datatype_cuda.h 2022-05-06 10:34:26.120375882 -0400
@@ -10,6 +10,8 @@
#ifndef _OPAL_DATATYPE_CUDA_H
#define _OPAL_DATATYPE_CUDA_H
+#include <string.h>
+
/* Structure to hold CUDA support functions that gets filled in when the
* common cuda code is initialized. This removes any dependency on <cuda.h>
* in the opal cuda datatype code. */
@@ -24,10 +26,24 @@
void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
bool opal_cuda_check_bufs(char *dest, char *src);
bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor );
-void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
+void* opal_cuda_memcpy_gpu(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size);
void* opal_cuda_memmove(void * dest, void * src, size_t size);
void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
+/*
+ * With CUDA enabled, all contiguous copies will pass through this function.
+ * Therefore, the first check is to see if the convertor is a GPU buffer.
+ */
+
+static inline void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
+{
+ if (OPAL_LIKELY(!(convertor->flags & CONVERTOR_CUDA))) {
+ return memcpy(dest, src, size);
+ }
+
+ return opal_cuda_memcpy_gpu(dest, src, size, convertor);
+}
+
#endif
diff -ur openmpi-4.1.1.orig/opal/datatype/opal_datatype_pack.h openmpi-4.1.1/opal/datatype/opal_datatype_pack.h
--- openmpi-4.1.1.orig/opal/datatype/opal_datatype_pack.h 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_datatype_pack.h 2022-05-06 10:29:01.375528587 -0400
@@ -21,9 +21,10 @@
#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
/* Make use of existing macro to do CUDA style memcpy */
+#include "opal/datatype/opal_datatype_cuda.h"
#undef MEMCPY_CSUM
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
- CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif
/**
diff -ur openmpi-4.1.1.orig/opal/datatype/opal_datatype_unpack.c openmpi-4.1.1/opal/datatype/opal_datatype_unpack.c
--- openmpi-4.1.1.orig/opal/datatype/opal_datatype_unpack.c 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_datatype_unpack.c 2022-05-06 10:16:50.547111046 -0400
@@ -41,6 +41,9 @@
#include "opal/datatype/opal_datatype_checksum.h"
#include "opal/datatype/opal_datatype_unpack.h"
#include "opal/datatype/opal_datatype_prototypes.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#endif
#if defined(CHECKSUM)
#define opal_unpack_general_function opal_unpack_general_checksum
@@ -207,7 +210,7 @@
#if OPAL_CUDA_SUPPORT
/* In the case where the data is being unpacked from device memory, need to
* use the special host to device memory copy. */
- pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor );
+ opal_cuda_memcpy(saved_data, user_data, data_length, pConvertor );
#else
MEMCPY( saved_data, user_data, data_length );
#endif
@@ -227,10 +230,10 @@
* bytes need to be converted back to their original values. */
{
char resaved_data[16];
- pConvertor->cbmemcpy(resaved_data, user_data, data_length, pConvertor );
+ opal_cuda_memcpy(resaved_data, user_data, data_length, pConvertor );
for(size_t i = 0; i < data_length; i++ ) {
if( unused_byte == resaved_data[i] )
- pConvertor->cbmemcpy(&user_data[i], &saved_data[i], 1, pConvertor);
+ opal_cuda_memcpy(&user_data[i], &saved_data[i], 1, pConvertor);
}
}
#else
diff -ur openmpi-4.1.1.orig/opal/datatype/opal_datatype_unpack.h openmpi-4.1.1/opal/datatype/opal_datatype_unpack.h
--- openmpi-4.1.1.orig/opal/datatype/opal_datatype_unpack.h 2021-04-24 13:28:07.000000000 -0400
+++ openmpi-4.1.1/opal/datatype/opal_datatype_unpack.h 2022-05-06 10:29:13.791484625 -0400
@@ -21,9 +21,10 @@
#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
/* Make use of existing macro to do CUDA style memcpy */
+#include "opal/datatype/opal_datatype_cuda.h"
#undef MEMCPY_CSUM
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
- CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif
/** |
|
yes I have a patch open here, but need to follow through: open-mpi/ompi#10364 The performance difference isn't enormous, up to 10% on very small messages with MPI_Alltoall (which I think are rare in the first place). |
This comment was marked as outdated.
This comment was marked as outdated.
|
Test report by @jfgrimm |
|
Test report by @branfosj |
|
Test report by @branfosj |
|
Test report by @branfosj |
|
Test report by @branfosj |
|
@boegelbot please test @ generoso |
|
@bartoldeman: Request for testing this PR well received on login1 PR test command '
Test results coming soon (I hope)... - notification for comment with ID 1139532313 processed Message to humans: this is just bookkeeping information for me, |
…asyconfigs into 20220504130632_new_pr_libevent2112
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
|
Test report by @boegelbot |
|
Test report by @boegel |
|
Test report by @boegel |
|
@boegelbot please test @ jsc-zen2 |
|
@SebastianAchilles: Request for testing this PR well received on jsczen2l1.int.jsc-zen2.easybuild-test.cluster PR test command '
Test results coming soon (I hope)... - notification for comment with ID 1139659333 processed Message to humans: this is just bookkeeping information for me, |
|
Test report by @boegelbot |
|
Going in, thanks @jfgrimm! |
Draft OpenMPI w/ GCC 11.3.0, for latest release candidate. Will update once it is released.
depends on:
(created using
eb --new-pr)