Skip to content

Commit af81014

Browse files
authored
Determine MPI Data Types in col_on_comm() & dst_on_comm() to prevent displacements overflow. (Fix for #2156) (#2157)
Determine MPI Data Types in col_on_comm() & dst_on_comm() to prevent displacements overflow. TYPE: bug fix KEYWORDS: prevent displacements overflow in MPI_Gatherv() and MPI_Scatterv() operations SOURCE: Benjamin Kirk & Negin Sobhani (NSF NCAR / CISL) DESCRIPTION OF CHANGES: Problem: The MPI_Gatherv() and MPI_Scatterv() operations require integer displacements into the communication buffers. Historically, all data was passed as MPI_CHAR, so the counts and displacements were expressed in bytes and were therefore larger than necessary. For large domain sizes this can cause the displace[] offsets to exceed the maximum int value and wrap to negative values. Solution: This change introduces additional error checking and then uses the function MPI_Type_match_size() (available since MPI-2.0) to determine a suitable MPI_Datatype for the given input *typesize. As a result, the displace[] offsets are expressed in units of the data type's extent rather than in bytes, and are less likely to overflow. ISSUE: Fixes #2156 LIST OF MODIFIED FILES: M frame/collect_on_comm.c TESTS CONDUCTED: Previously failing cases now run. RELEASE NOTE: Determine MPI Data Types in col_on_comm() & dst_on_comm() to prevent displacements overflow.
1 parent 33ce70c commit af81014

File tree

1 file changed

+54
-19
lines changed

1 file changed

+54
-19
lines changed

frame/collect_on_comm.c

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@
3636
# endif
3737
#endif
3838

39-
39+
4040
int col_on_comm ( int *, int *, void *, int *, void *, int *, int);
4141
int dst_on_comm ( int *, int *, void *, int *, void *, int *, int);
4242

43-
void
43+
void
4444
COLLECT_ON_COMM ( int * comm, int * typesize ,
4545
void * inbuf, int *ninbuf , void * outbuf, int * noutbuf )
4646
{
@@ -67,8 +67,9 @@ col_on_comm ( int * Fcomm, int * typesize ,
6767
int *displace ;
6868
int noutbuf_loc ;
6969
int root_task ;
70+
MPI_Datatype dtype;
71+
int ierr = -1;
7072
MPI_Comm *comm, dummy_comm ;
71-
int ierr ;
7273

7374
comm = &dummy_comm ;
7475
*comm = MPI_Comm_f2c( *Fcomm ) ;
@@ -90,28 +91,45 @@ col_on_comm ( int * Fcomm, int * typesize ,
9091
for ( p = 1 , displace[0] = 0 , noutbuf_loc = recvcounts[0] ; p < ntasks ; p++ ) {
9192
displace[p] = displace[p-1]+recvcounts[p-1] ;
9293
noutbuf_loc = noutbuf_loc + recvcounts[p] ;
94+
95+
/* check for overflow: displace is the partial sum of recvcounts, which can overflow for large problems. */
96+
if (displace[p] < 0) {
97+
#ifndef MS_SUA
98+
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
99+
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> typesize = %d\n",
100+
p, p, displace[p], *typesize);
101+
#endif
102+
MPI_Abort(MPI_COMM_WORLD,1) ;
103+
}
93104
}
94105

95106
if ( noutbuf_loc > * noutbuf )
96107
{
97108
#ifndef MS_SUA
98109
fprintf(stderr,"FATAL ERROR: collect_on_comm: noutbuf_loc (%d) > noutbuf (%d)\n",
99-
noutbuf_loc , * noutbuf ) ;
110+
noutbuf_loc , * noutbuf ) ;
100111
fprintf(stderr,"WILL NOT perform the collection operation\n") ;
101112
#endif
102113
MPI_Abort(MPI_COMM_WORLD,1) ;
103114
}
104115

105-
/* multiply everything by the size of the type */
106-
for ( p = 0 ; p < ntasks ; p++ ) {
107-
displace[p] *= *typesize ;
108-
recvcounts[p] *= *typesize ;
116+
}
117+
118+
/* handle different sized data types appropriately. */
119+
ierr = MPI_Type_match_size (MPI_TYPECLASS_REAL, *typesize, &dtype);
120+
if (MPI_SUCCESS != ierr) {
121+
ierr = MPI_Type_match_size (MPI_TYPECLASS_INTEGER, *typesize, &dtype);
122+
if (MPI_SUCCESS != ierr) {
123+
#ifndef MS_SUA
124+
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
125+
#endif
126+
MPI_Abort(MPI_COMM_WORLD,1) ;
109127
}
110128
}
111129

112-
ierr = MPI_Gatherv( inbuf , *ninbuf * *typesize , MPI_CHAR ,
113-
outbuf , recvcounts , displace, MPI_CHAR ,
114-
root_task , *comm ) ;
130+
ierr = MPI_Gatherv( inbuf , *ninbuf, dtype,
131+
outbuf , recvcounts , displace, dtype,
132+
root_task , *comm ) ;
115133
#ifndef MS_SUA
116134
if ( ierr != 0 ) fprintf(stderr,"%s %d MPI_Gatherv returns %d\n",__FILE__,__LINE__,ierr ) ;
117135
#endif
@@ -152,6 +170,8 @@ dst_on_comm ( int * Fcomm, int * typesize ,
152170
int *displace ;
153171
int noutbuf_loc ;
154172
int root_task ;
173+
MPI_Datatype dtype;
174+
int ierr = -1;
155175
MPI_Comm *comm, dummy_comm ;
156176

157177
comm = &dummy_comm ;
@@ -171,18 +191,34 @@ dst_on_comm ( int * Fcomm, int * typesize ,
171191
for ( p = 1 , displace[0] = 0 , noutbuf_loc = sendcounts[0] ; p < ntasks ; p++ ) {
172192
displace[p] = displace[p-1]+sendcounts[p-1] ;
173193
noutbuf_loc = noutbuf_loc + sendcounts[p] ;
194+
195+
/* check for overflow: displace is the partial sum of sendcounts, which can overflow for large problems. */
196+
if ( (displace[p] < 0) || (noutbuf_loc < 0) ) {
197+
#ifndef MS_SUA
198+
fprintf(stderr,"%s %d buffer offset overflow!!\n",__FILE__,__LINE__) ;
199+
fprintf(stderr," ---> p = %d,\n ---> displace[%d] = %d,\n ---> noutbuf_loc = %d,\n ---> typesize = %d\n",
200+
p, p, displace[p], noutbuf_loc, *typesize);
201+
#endif
202+
MPI_Abort(MPI_COMM_WORLD,1) ;
203+
}
174204
}
205+
}
175206

176-
/* multiply everything by the size of the type */
177-
for ( p = 0 ; p < ntasks ; p++ ) {
178-
displace[p] *= *typesize ;
179-
sendcounts[p] *= *typesize ;
207+
/* handle different sized data types appropriately. */
208+
ierr = MPI_Type_match_size (MPI_TYPECLASS_REAL, *typesize, &dtype);
209+
if (MPI_SUCCESS != ierr) {
210+
ierr = MPI_Type_match_size (MPI_TYPECLASS_INTEGER, *typesize, &dtype);
211+
if (MPI_SUCCESS != ierr) {
212+
#ifndef MS_SUA
213+
fprintf(stderr,"%s %d FATAL ERROR: unhandled typesize = %d!!\n", __FILE__,__LINE__,*typesize) ;
214+
#endif
215+
MPI_Abort(MPI_COMM_WORLD,1) ;
180216
}
181217
}
182218

183-
MPI_Scatterv( inbuf , sendcounts , displace, MPI_CHAR ,
184-
outbuf , *noutbuf * *typesize , MPI_CHAR ,
185-
root_task , *comm ) ;
219+
MPI_Scatterv( inbuf, sendcounts, displace, dtype,
220+
outbuf, *noutbuf, dtype,
221+
root_task, *comm ) ;
186222

187223
free(sendcounts) ;
188224
free(displace) ;
@@ -241,4 +277,3 @@ rlim_ ()
241277
}
242278
#endif
243279
#endif
244-

0 commit comments

Comments
 (0)