Skip to content

Commit 6741186

Browse files
committed
[CODEGEN] Added GPU abstraction to CODEGEN
1 parent 63cf27c commit 6741186

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+589
-339
lines changed

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include <memory>
2323
#include <type_traits>
2424

25-
#ifdef __CUDACC__
25+
#ifdef MGONGPUCPP_GPUIMPL
2626
namespace mg5amcGpu
2727
#else
2828
namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
8282
Bridge& operator=( const Bridge& ) = delete;
8383
Bridge& operator=( Bridge&& ) = delete;
8484

85-
#ifdef __CUDACC__
85+
#ifdef MGONGPUCPP_GPUIMPL
8686
/**
8787
* Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
8888
* (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@ namespace mg5amcCpu
149149
unsigned int m_nevt; // number of events
150150
int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated)
151151

152-
#ifdef __CUDACC__
152+
#ifdef MGONGPUCPP_GPUIMPL
153153
int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
154154
int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified)
155155
mg5amcGpu::DeviceBuffer<FORTRANFPTYPE, sizePerEventMomenta> m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
186186
// Forward declare transposition methods
187187
//
188188

189-
#ifdef __CUDACC__
189+
#ifdef MGONGPUCPP_GPUIMPL
190190

191191
template<typename Tin, typename Tout>
192192
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
193193

194-
#endif // __CUDACC__
194+
#endif // MGONGPUCPP_GPUIMPL
195195

196196
template<typename Tin, typename Tout>
197197
void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
208208
Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
209209
: m_nevt( nevtF )
210210
, m_nGoodHel( -1 )
211-
#ifdef __CUDACC__
211+
#ifdef MGONGPUCPP_GPUIMPL
212212
, m_gputhreads( 256 ) // default number of gpu threads
213213
, m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
214214
, m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
232232
{
233233
if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
234234
if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
235-
#ifdef __CUDACC__
235+
#ifdef MGONGPUCPP_GPUIMPL
236236
if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
237237
throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
238238
while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
250250
std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
251251
mg5amcCpu::CPPProcess process( /*verbose=*/false );
252252
m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
253-
#endif // __CUDACC__
253+
#endif // MGONGPUCPP_GPUIMPL
254254
process.initProc( "../../Cards/param_card.dat" );
255255
}
256256

257-
#ifdef __CUDACC__
257+
#ifdef MGONGPUCPP_GPUIMPL
258258
template<typename FORTRANFPTYPE>
259259
void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
260260
{
@@ -268,7 +268,7 @@ namespace mg5amcCpu
268268
}
269269
#endif
270270

271-
#ifdef __CUDACC__
271+
#ifdef MGONGPUCPP_GPUIMPL
272272
template<typename FORTRANFPTYPE>
273273
void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
274274
const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
283283
constexpr int neppM = MemoryAccessMomenta::neppM;
284284
if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
285285
{
286-
checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
286+
gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice );
287287
}
288288
else
289289
{
290-
checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
290+
gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice );
291291
const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
292292
//const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
293-
dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
293+
gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
294294
}
295295
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
296296
{
@@ -333,7 +333,7 @@ namespace mg5amcCpu
333333
}
334334
#endif
335335

336-
#ifndef __CUDACC__
336+
#ifndef MGONGPUCPP_GPUIMPL
337337
template<typename FORTRANFPTYPE>
338338
void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
339339
const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
388388
// - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
389389
//
390390

391-
#ifdef __CUDACC__
391+
#ifdef MGONGPUCPP_GPUIMPL
392392
template<typename Tin, typename Tout>
393393
__global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
394394
{

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include "BridgeKernels.h"
77

8+
#include "GpuAbstraction.h"
89
#include "MemoryAccessMomenta.h"
910

1011
#include <sstream>
@@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia
1415

1516
//============================================================================
1617

17-
#ifdef __CUDACC__
18+
#ifdef MGONGPUCPP_GPUIMPL
1819
namespace mg5amcGpu
1920
#else
2021
namespace mg5amcCpu
@@ -45,7 +46,7 @@ namespace mg5amcCpu
4546

4647
//============================================================================
4748

48-
#ifndef __CUDACC__
49+
#ifndef MGONGPUCPP_GPUIMPL
4950
namespace mg5amcCpu
5051
{
5152

@@ -96,7 +97,7 @@ namespace mg5amcCpu
9697

9798
//============================================================================
9899

99-
#ifdef __CUDACC__
100+
#ifdef MGONGPUCPP_GPUIMPL
100101
namespace mg5amcGpu
101102
{
102103

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include "MatrixElementKernels.h"
1313
#include "MemoryBuffers.h"
1414

15-
#ifdef __CUDACC__
15+
#ifdef MGONGPUCPP_GPUIMPL
1616
namespace mg5amcGpu
1717
#else
1818
namespace mg5amcCpu
@@ -49,7 +49,7 @@ namespace mg5amcCpu
4949

5050
//--------------------------------------------------------------------------
5151

52-
#ifndef __CUDACC__
52+
#ifndef MGONGPUCPP_GPUIMPL
5353
// A Bridge wrapper class encapsulating matrix element calculations on a CPU host
5454
class BridgeKernelHost final : public BridgeKernelBase
5555
{
@@ -89,7 +89,7 @@ namespace mg5amcCpu
8989

9090
//--------------------------------------------------------------------------
9191

92-
#ifdef __CUDACC__
92+
#ifdef MGONGPUCPP_GPUIMPL
9393
// A Bridge wrapper class encapsulating matrix element calculations on a GPU device
9494
class BridgeKernelDevice : public BridgeKernelBase
9595
{

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
55

66
#include "CommonRandomNumbers.h"
7+
#include "GpuAbstraction.h"
78
#include "MemoryBuffers.h"
89
#include "RandomNumberKernels.h"
910

1011
#include <cassert>
1112

12-
#ifdef __CUDACC__
13+
#ifdef MGONGPUCPP_GPUIMPL
1314
namespace mg5amcGpu
1415
#else
1516
namespace mg5amcCpu

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include "CrossSectionKernels.h"
77

8+
#include "GpuAbstraction.h"
89
#include "MemoryAccessMatrixElements.h"
910
#include "MemoryAccessWeights.h"
1011
#include "MemoryBuffers.h"
@@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
7778

7879
//============================================================================
7980

80-
#ifdef __CUDACC__
81+
#ifdef MGONGPUCPP_GPUIMPL
8182
namespace mg5amcGpu
8283
#else
8384
namespace mg5amcCpu
@@ -185,7 +186,7 @@ namespace mg5amcCpu
185186

186187
//============================================================================
187188

188-
#ifdef __CUDACC__
189+
#ifdef MGONGPUCPP_GPUIMPL
189190
namespace mg5amcGpu
190191
{
191192

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
//============================================================================
1515

16-
#ifdef __CUDACC__
16+
#ifdef MGONGPUCPP_GPUIMPL
1717
namespace mg5amcGpu
1818
#else
1919
namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
9696
//--------------------------------------------------------------------------
9797

9898
/*
99-
#ifdef __CUDACC__
99+
#ifdef MGONGPUCPP_GPUIMPL
100100
// A class encapsulating the calculation of event statistics on a GPU device
101101
class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
102102
{

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
//--------------------------------------------------------------------------
1616

1717
// See https://stackoverflow.com/a/14038590
18-
#ifdef __CUDACC__ /* clang-format off */
18+
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
1919
#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
2020
inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
2121
{
@@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort
2929

3030
//--------------------------------------------------------------------------
3131

32-
#ifdef __CUDACC__
32+
#ifdef MGONGPUCPP_GPUIMPL
3333
namespace mg5amcGpu
3434
{
3535
// Instantiate a CudaRuntime at the beginnining of the application's main to

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
44
// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
55

6-
#include "CudaRuntime.h"
6+
#include "GpuRuntime.h"
77
#include "MemoryBuffers.h"
88
#include "RandomNumberKernels.h"
99

@@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool
2222
}
2323
#endif /* clang-format on */
2424

25-
#ifdef __CUDACC__
25+
#ifdef MGONGPUCPP_CUDACC
2626
namespace mg5amcGpu
2727
#else
2828
namespace mg5amcCpu
@@ -36,7 +36,7 @@ namespace mg5amcCpu
3636
{
3737
if( m_isOnDevice )
3838
{
39-
#ifdef __CUDACC__
39+
#ifdef MGONGPUCPP_CUDACC
4040
if( !m_rnarray.isOnDevice() )
4141
throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
4242
#else
@@ -114,7 +114,7 @@ namespace mg5amcCpu
114114
/*
115115
printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
116116
fptype* data = m_rnarray.data();
117-
#ifdef __CUDACC__
117+
#ifdef MGONGPUCPP_GPUIMPL
118118
if( m_rnarray.isOnDevice() )
119119
{
120120
data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
123123
#endif
124124
for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
125125
printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
126-
#ifdef __CUDACC__
126+
#ifdef MGONGPUCPP_GPUIMPL
127127
if( m_rnarray.isOnDevice() ) delete[] data;
128128
#endif
129129
*/

epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <limits>
1717
#include <string>
1818

19-
#ifdef __CUDACC__
19+
#ifdef MGONGPUCPP_GPUIMPL
2020
namespace mg5amcGpu
2121
#else
2222
namespace mg5amcCpu
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#ifndef MG5AMC_GPUABSTRACTION_H
2+
#define MG5AMC_GPUABSTRACTION_H 1
3+
4+
#include <cassert>
5+
6+
#ifdef MGONGPUCPP_GPUIMPL
7+
#define MGONGPUCPP_CUDACC 1
8+
#endif
9+
10+
#ifdef __HIPCC__
11+
#include "hip/hip_runtime.h"
12+
#define MGONGPUCPP_HIPCC 1
13+
#endif
14+
15+
#ifdef MGONGPUCPP_CUDACC
16+
17+
// Defines correct compiler
18+
#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL
19+
20+
//--------------------------------------------------------------------------
21+
22+
#define gpuError_t cudaError_t
23+
#define gpuPeekAtLastError cudaPeekAtLastError
24+
#define gpuGetErrorString cudaGetErrorString
25+
#define gpuSuccess cudaSuccess
26+
27+
#define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) )
28+
#define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) )
29+
30+
#define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) )
31+
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
32+
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
33+
#define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) )
34+
35+
#define gpuFree(ptr) checkGpu( cudaFree(ptr) )
36+
#define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) )
37+
38+
#define gpuSetDevice cudaSetDevice
39+
#define gpuDeviceSynchronize cudaDeviceSynchronize
40+
#define gpuDeviceReset cudaDeviceReset
41+
42+
#define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
43+
#define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
44+
45+
//--------------------------------------------------------------------------
46+
47+
#elif defined MGONGPUCPP_HIPCC
48+
49+
// Defines correct compiler
50+
#define MGONGPUCPP_GPUIMPL __HCC__
51+
52+
//--------------------------------------------------------------------------
53+
54+
#define gpuError_t hipError_t
55+
#define gpuPeekAtLastError hipPeekAtLastError
56+
#define gpuGetErrorString hipGetErrorString
57+
#define gpuSuccess hipSuccess
58+
59+
#define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
60+
#define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
61+
62+
#define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
63+
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
64+
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
65+
#define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )
66+
67+
#define gpuFree(ptr) checkGpu( hipFree(ptr) )
68+
#define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )
69+
70+
#define gpuSetDevice hipSetDevice
71+
#define gpuDeviceSynchronize hipDeviceSynchronize
72+
#define gpuDeviceReset hipDeviceReset
73+
74+
#define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
75+
#define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
76+
77+
#endif
78+
79+
#endif // MG5AMC_GPUABSTRACTION_H

0 commit comments

Comments
 (0)