valassi
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h‎
Lines changed: 15 additions & 15 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h‎
Lines changed: 15 additions & 15 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc‎
Lines changed: 4 additions & 3 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h‎
Lines changed: 3 additions & 3 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc‎
Lines changed: 2 additions & 1 deletion b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc‎
Lines changed: 3 additions & 2 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h‎
Lines changed: 2 additions & 2 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h‎
Lines changed: 2 additions & 2 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc‎
Lines changed: 5 additions & 5 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h‎
Lines changed: 1 addition & 1 deletion b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h‎
Lines changed: 79 additions & 0 deletions b/‎epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h‎
Lines changed: 79 additions & 0 deletions
@@ -22,7 +22,7 @@
 #include <memory>
 #include <type_traits>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -82,7 +82,7 @@ namespace mg5amcCpu
     Bridge& operator=( const Bridge& ) = delete;
     Bridge& operator=( Bridge&& ) = delete;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     /**
      * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads
      * (this is needed for BridgeKernel tests rather than for actual production use in Fortran)
@@ -149,7 +149,7 @@ namespace mg5amcCpu
     unsigned int m_nevt; // number of events
     int m_nGoodHel;      // the number of good helicities (-1 initially when they have not yet been calculated)
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     int m_gputhreads; // number of gpu threads (default set from number of events, can be modified)
     int m_gpublocks;  // number of gpu blocks (default set from number of events, can be modified)
     mg5amcGpu::DeviceBuffer<FORTRANFPTYPE, sizePerEventMomenta> m_devMomentaF;
@@ -186,12 +186,12 @@ namespace mg5amcCpu
   // Forward declare transposition methods
   //
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
   template<typename Tin, typename Tout>
   void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt );
@@ -208,7 +208,7 @@ namespace mg5amcCpu
   Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F )
     : m_nevt( nevtF )
     , m_nGoodHel( -1 )
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     , m_gputhreads( 256 )                  // default number of gpu threads
     , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads
     , m_devMomentaF( m_nevt )
@@ -232,7 +232,7 @@ namespace mg5amcCpu
   {
     if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
     if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -250,11 +250,11 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     mg5amcCpu::CPPProcess process( /*verbose=*/false );
     m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
     process.initProc( "../../Cards/param_card.dat" );
   }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -268,7 +268,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -283,14 +283,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); // vendor-neutral copy kind (the cuda* enum does not exist in a HIP build)
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); // vendor-neutral copy kind (the cuda* enum does not exist in a HIP build)
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
@@ -333,7 +333,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template<typename FORTRANFPTYPE>
   void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta,
                                             const FORTRANFPTYPE* gs,
@@ -388,7 +388,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<typename Tin, typename Tout>
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
 
@@ -5,6 +5,7 @@
 
 #include "BridgeKernels.h"
 
+#include "GpuAbstraction.h"
 #include "MemoryAccessMomenta.h"
 
 #include <sstream>
@@ -14,7 +15,7 @@ constexpr int npar = CPPProcess::npar; // #particles in total (external = initia
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -45,7 +46,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
 
@@ -96,7 +97,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
 
 
@@ -12,7 +12,7 @@
 #include "MatrixElementKernels.h"
 #include "MemoryBuffers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -49,7 +49,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A Bridge wrapper class encapsulating matrix element calculations on a CPU host
   class BridgeKernelHost final : public BridgeKernelBase
   {
@@ -89,7 +89,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A Bridge wrapper class encapsulating matrix element calculations on a GPU device
   class BridgeKernelDevice : public BridgeKernelBase
   {
 
@@ -4,12 +4,13 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "CommonRandomNumbers.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
 
 #include <cassert>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
 
@@ -5,6 +5,7 @@
 
 #include "CrossSectionKernels.h"
 
+#include "GpuAbstraction.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessWeights.h"
 #include "MemoryBuffers.h"
@@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL )
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -185,7 +186,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
 
 
@@ -13,7 +13,7 @@
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -96,7 +96,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
   /*
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating the calculation of event statistics on a GPU device
   class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents
   {
 
@@ -15,7 +15,7 @@
 //--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
 #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
 inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
 {
@@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
   // Instantiate a CudaRuntime at the beginnining of the application's main to
 
@@ -3,7 +3,7 @@
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "MemoryBuffers.h"
 #include "RandomNumberKernels.h"
 
@@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool
 }
 #endif /* clang-format on */
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_CUDACC
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -36,7 +36,7 @@ namespace mg5amcCpu
   {
     if( m_isOnDevice )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_CUDACC
       if( !m_rnarray.isOnDevice() )
         throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" );
 #else
@@ -114,7 +114,7 @@ namespace mg5amcCpu
     /*
     printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
     fptype* data = m_rnarray.data();
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( m_rnarray.isOnDevice() )
     {
       data = new fptype[m_rnarray.size()]();
@@ -123,7 +123,7 @@ namespace mg5amcCpu
 #endif
     for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
       printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( m_rnarray.isOnDevice() ) delete[] data;
 #endif
     */
 
@@ -16,7 +16,7 @@
 #include <limits>
 #include <string>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
 
@@ -0,0 +1,79 @@
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include <cassert>
+
+// Detect the GPU compiler from its own predefined macro (nvcc defines __CUDACC__, hipcc defines __HIPCC__)
+#ifdef __CUDACC__
+  #define MGONGPUCPP_CUDACC 1
+#endif
+
+#ifdef __HIPCC__
+  #include "hip/hip_runtime.h"
+  #define MGONGPUCPP_HIPCC 1
+#endif
+
+// Vendor-neutral gpuXxx aliases: the wrappers that expand to checkGpu(...) need checkGpu (not defined here) at the point of use
+#ifdef MGONGPUCPP_CUDACC
+  // Marks a GPU build (CUDA flavour): defined as 1 so that both '#ifdef' and '#if' tests on it work
+  #define MGONGPUCPP_GPUIMPL 1
+
+  //--------------------------------------------------------------------------
+
+  #define gpuError_t cudaError_t
+  #define gpuPeekAtLastError cudaPeekAtLastError
+  #define gpuGetErrorString cudaGetErrorString
+  #define gpuSuccess cudaSuccess
+
+  #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) )
+  #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) )
+
+  #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) )
+  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+  #define gpuMemcpyToSymbol(symbol, srcData, srcBytes) checkGpu( cudaMemcpyToSymbol(symbol, srcData, srcBytes) )
+
+  #define gpuFree(ptr) checkGpu( cudaFree(ptr) )
+  #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) )
+
+  #define gpuSetDevice cudaSetDevice
+  #define gpuDeviceSynchronize cudaDeviceSynchronize
+  #define gpuDeviceReset cudaDeviceReset
+
+  #define gpuLaunchKernel( kernel, blocks, threads, ...)                    kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+//--------------------------------------------------------------------------
+
+#elif defined MGONGPUCPP_HIPCC
+  // Marks a GPU build (HIP flavour): defined as 1 rather than via the legacy __HCC__ macro, which may be undefined
+  #define MGONGPUCPP_GPUIMPL 1
+
+  //--------------------------------------------------------------------------
+
+  #define gpuError_t hipError_t
+  #define gpuPeekAtLastError hipPeekAtLastError
+  #define gpuGetErrorString hipGetErrorString
+  #define gpuSuccess hipSuccess
+
+  #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
+  #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
+
+  #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
+  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+  #define gpuMemcpyToSymbol(symbol, srcData, srcBytes) checkGpu( hipMemcpyToSymbol(symbol, srcData, srcBytes) )
+
+  #define gpuFree(ptr) checkGpu( hipFree(ptr) )
+  #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )
+
+  #define gpuSetDevice hipSetDevice
+  #define gpuDeviceSynchronize hipDeviceSynchronize
+  #define gpuDeviceReset hipDeviceReset
+
+  #define gpuLaunchKernel( kernel, blocks, threads, ...)                    kernel<<<blocks, threads>>> (__VA_ARGS__)
+  #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`//--------------------------------------------------------------------------`
`16`	`16`
`17`	`17`	`// See https://stackoverflow.com/a/14038590`
`18`		`-#ifdef __CUDACC__ /* clang-format off */`
	`18`	`+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */`
`19`	`19`	`#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }`
`20`	`20`	`inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )`
`21`	`21`	`{`
`@@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort`
`29`	`29`
`30`	`30`	`//--------------------------------------------------------------------------`
`31`	`31`
`32`		`-#ifdef __CUDACC__`
	`32`	`+#ifdef MGONGPUCPP_GPUIMPL`
`33`	`33`	`namespace mg5amcGpu`
`34`	`34`	`{`
`35`	`35`	`// Instantiate a CudaRuntime at the beginnining of the application's main to`